ameerazam08's picture
Upload folder using huggingface_hub
79cf5f5 verified
import pathlib
import click
import librosa
import soundfile
import textgrid
import tqdm
def _slice_tier(tier, name, start, end):
    """Clip ``tier`` to the window ``[start, end]`` and shift it to start at 0.

    Args:
        tier: Iterable of intervals exposing ``minTime``, ``maxTime`` and ``mark``.
        name: Name given to the returned ``textgrid.IntervalTier``.
        start: Window start time; subtracted from every emitted boundary.
        end: Window end time.

    Returns:
        A new ``textgrid.IntervalTier`` holding every interval that has a
        positive-length overlap with the window, trimmed to the window.
    """
    sliced = textgrid.IntervalTier(name=name)
    for interval in tier:
        # Intersect the interval with the window; an empty or zero-length
        # intersection means the interval lies outside (or merely touches) it.
        clipped_min = max(start, interval.minTime)
        clipped_max = min(end, interval.maxTime)
        if clipped_min >= clipped_max:
            continue
        sliced.add(minTime=clipped_min - start, maxTime=clipped_max - start, mark=interval.mark)
    return sliced


@click.command(help='Slice 3-tier TextGrids and long recordings into segmented 2-tier TextGrids and wavs')
@click.option(
    '--wavs', required=True,
    help='Directory containing the long wav recordings to slice'
)
@click.option(
    '--tg', required=False,
    help='Directory containing the 3-tier TextGrid files (defaults to wav directory)'
)
@click.option(
    '--out', required=True,
    help='Path to output directory for sliced files'
)
@click.option(
    '--preserve_sentence_names', is_flag=True,
    help='Whether to use sentence marks as filenames (will be re-numbered by default)'
)
@click.option(
    '--digits', required=False, type=int, default=3,
    help='Number of suffix digits (defaults to 3, will be padded with zeros on the left)'
)
@click.option(
    '--wav_subtype', required=False, default='PCM_16',
    help='Wav subtype (defaults to PCM_16)'
)
@click.option(
    '--overwrite', is_flag=True,
    help='Overwrite existing files'
)
def slice_tg(wavs, tg, out, preserve_sentence_names, digits, wav_subtype, overwrite):
    """Slice each TextGrid/wav pair into one TextGrid and one wav per sentence.

    For every ``*.TextGrid`` file in the TextGrid directory, the wav with the
    same stem is loaded from the wav directory. Tier 0 is read as sentences,
    tier 1 as words and tier 2 as phones. Each sentence interval with a
    non-empty mark produces a 2-tier TextGrid (``words``, ``phones``) whose
    times are relative to the sentence start, plus the matching audio slice.

    The command-line help text comes from the explicit ``help=`` argument of
    ``click.command`` above, not from this docstring.

    Args:
        wavs: Directory containing the input wav recordings.
        tg: Directory containing the input TextGrid files; ``None`` means the
            wav directory is used.
        out: Output directory (created if missing).
        preserve_sentence_names: Name outputs after the sentence mark instead
            of ``<input stem>_<zero-padded index>``.
        digits: Zero-padding width of the numeric suffix.
        wav_subtype: Subtype passed to ``soundfile.write``.
        overwrite: Allow replacing existing output files.

    Raises:
        FileExistsError: An output TextGrid or wav already exists and
            ``overwrite`` is not set.
    """
    wav_path_in = pathlib.Path(wavs)
    tg_path_in = wav_path_in if tg is None else pathlib.Path(tg)
    sliced_path_out = pathlib.Path(out)
    sliced_path_out.mkdir(parents=True, exist_ok=True)

    # A list (rather than the raw glob generator) lets tqdm show a total, and
    # sorting makes the processing order independent of the filesystem.
    tg_files = sorted(tg_path_in.glob('*.TextGrid'))
    for tg_file in tqdm.tqdm(tg_files):
        grid = textgrid.TextGrid()
        grid.read(tg_file)
        # sr=None keeps the sample rate stored in the file.
        wav, sr = librosa.load((wav_path_in / tg_file.name).with_suffix('.wav'), sr=None)
        sentences_tier = grid[0]
        words_tier = grid[1]
        phones_tier = grid[2]

        # Numbering restarts for every input file and skips empty sentences.
        idx = 0
        for sentence in sentences_tier:
            if sentence.mark == '':
                continue

            sentence_tg = textgrid.TextGrid()
            sentence_tg.append(_slice_tier(words_tier, 'words', sentence.minTime, sentence.maxTime))
            sentence_tg.append(_slice_tier(phones_tier, 'phones', sentence.minTime, sentence.maxTime))

            if preserve_sentence_names:
                tg_file_out = sliced_path_out / f'{sentence.mark}.TextGrid'
            else:
                tg_file_out = sliced_path_out / f'{tg_file.stem}_{str(idx).zfill(digits)}.TextGrid'
            wav_file_out = tg_file_out.with_suffix('.wav')

            # Refuse to clobber either output unless explicitly allowed.
            if tg_file_out.exists() and not overwrite:
                raise FileExistsError(str(tg_file_out))
            if wav_file_out.exists() and not overwrite:
                raise FileExistsError(str(wav_file_out))

            sentence_tg.write(tg_file_out)
            # The end index is one past the truncated end sample, clamped to
            # the length of the recording.
            first_sample = int(sentence.minTime * sr)
            last_sample = min(wav.shape[0], int(sentence.maxTime * sr) + 1)
            sentence_wav = wav[first_sample:last_sample]
            soundfile.write(
                wav_file_out,
                sentence_wav, samplerate=sr, subtype=wav_subtype
            )
            idx += 1
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    slice_tg()