|
|
import csv |
|
|
import pathlib |
|
|
import random |
|
|
|
|
|
import click |
|
|
import librosa |
|
|
import numpy as np |
|
|
import soundfile |
|
|
import tqdm |
|
|
from textgrid import TextGrid |
|
|
|
|
|
|
|
|
def _add_silence(y, ph_seq, ph_dur, samplerate, min_sil, max_sil, at_start):
    """Pad ``y`` with a random-length run of zeros and account for it as 'SP'.

    ``ph_seq`` and ``ph_dur`` are modified in place: if the phoneme on the
    padded edge is already 'SP' its duration is extended, otherwise a new
    'SP' entry is inserted on that edge. An empty ``ph_seq`` simply receives
    a new 'SP' entry instead of raising ``IndexError``.

    Args:
        y: 1-D waveform array.
        ph_seq: list of phoneme marks.
        ph_dur: list of phoneme durations in seconds, parallel to ``ph_seq``.
        samplerate: sample rate of ``y``; converts samples to seconds.
        min_sil: minimum silence length in samples (inclusive).
        max_sil: maximum silence length in samples (exclusive).
        at_start: pad the beginning when true, the end otherwise.

    Returns:
        The padded waveform (a new array).
    """
    len_sil = random.randrange(min_sil, max_sil)
    silence = np.zeros((len_sil,), dtype=np.float32)
    dur_sil = len_sil / samplerate
    edge = 0 if at_start else -1

    y = np.concatenate((silence, y) if at_start else (y, silence))
    if ph_seq and ph_seq[edge] == 'SP':
        # Merge with the existing silence instead of emitting two 'SP' in a row.
        ph_dur[edge] += dur_sil
    elif at_start:
        ph_seq.insert(0, 'SP')
        ph_dur.insert(0, dur_sil)
    else:
        ph_seq.append('SP')
        ph_dur.append(dur_sil)
    return y


@click.command(help='Collect phoneme alignments into transcriptions.csv')
@click.option('--wavs', required=True, help='Path to the segments directory')
@click.option('--tg', required=True, help='Path to the final TextGrids directory')
@click.option('--dataset', required=True, help='Path to dataset directory')
@click.option('--skip_silence_insertion', is_flag=True, show_default=True,
              help='Do not insert silence around segments')
@click.option('--wav_subtype', default="PCM_16", show_default=True,
              help='WAV subtype')
def build_dataset(wavs, tg, dataset, skip_silence_insertion, wav_subtype):
    """Build a dataset directory from segmented wavs and their TextGrids.

    For every ``*.wav`` in ``wavs`` the audio is loaded as 44.1 kHz mono, the
    same-named ``.TextGrid`` in ``tg`` is read, and the marks and durations of
    its tier at index 1 become the phoneme sequence. Unless disabled, silence
    is randomly added (independently, each with probability 0.5) before and
    after the audio. The audio is written to ``<dataset>/wavs`` and one row
    per file is written to ``<dataset>/transcriptions.csv``.

    Args:
        wavs: path to the directory containing the segment wav files.
        tg: path to the directory containing the TextGrid files.
        dataset: output directory; created if it does not exist.
        skip_silence_insertion: when true, audio and durations are kept as-is.
        wav_subtype: subtype passed to ``soundfile.write``.
    """
    wavs = pathlib.Path(wavs)
    tg_dir = pathlib.Path(tg)
    dataset = pathlib.Path(dataset)
    # Sort so the row order of transcriptions.csv does not depend on the
    # order in which the OS happens to list the directory.
    filelist = sorted(wavs.glob('*.wav'))

    dataset.mkdir(parents=True, exist_ok=True)
    (dataset / 'wavs').mkdir(exist_ok=True)

    transcriptions = []
    samplerate = 44100
    # Inserted silence lasts between 0.1 s and 0.5 s, expressed in samples.
    min_sil = int(0.1 * samplerate)
    max_sil = int(0.5 * samplerate)

    for wavfile in tqdm.tqdm(filelist):
        y, _ = librosa.load(wavfile, sr=samplerate, mono=True)

        tgfile = tg_dir / wavfile.with_suffix('.TextGrid').name
        grid = TextGrid()
        grid.read(str(tgfile))
        # Tier index 1 is read as the phoneme tier.
        # NOTE(review): presumably tier 0 holds words - confirm against the aligner output.
        phones = grid[1]
        ph_seq = [ph.mark for ph in phones]
        ph_dur = [ph.maxTime - ph.minTime for ph in phones]

        if not skip_silence_insertion:
            # Head and tail are decided independently.
            if random.random() < 0.5:
                y = _add_silence(y, ph_seq, ph_dur, samplerate, min_sil, max_sil, at_start=True)
            if random.random() < 0.5:
                y = _add_silence(y, ph_seq, ph_dur, samplerate, min_sil, max_sil, at_start=False)

        soundfile.write(dataset / 'wavs' / wavfile.name, y, samplerate, subtype=wav_subtype)
        transcriptions.append({
            'name': wavfile.stem,
            'ph_seq': ' '.join(ph_seq),
            'ph_dur': ' '.join([str(round(d, 6)) for d in ph_dur]),
        })

    with open(dataset / 'transcriptions.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur'])
        writer.writeheader()
        writer.writerows(transcriptions)

    print(f'All wavs and transcriptions saved in {dataset}')
|
|
|
|
|
|
|
|
# Script entry point: run the click command only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    build_dataset()
|
|
|