| | |
| | |
| | |
| | |
| |
|
| | import numpy as np |
| | import os |
| | import tgt |
| |
|
| |
|
def get_alignment(tier, cfg):
    """Turn a phone tier into phone labels and per-phone frame counts.

    Silence intervals ("sil", "sp", "spn") at the very beginning and at the
    very end of the tier are discarded; silences that sit between two
    non-silent phones are kept.

    Args:
        tier: Object whose ``_objects`` attribute iterates over intervals
            exposing ``start_time``, ``end_time`` and ``text``.
        cfg: Mapping providing ``"sample_rate"`` and ``"hop_size"``.

    Returns:
        A tuple ``(phones, durations, start_time, end_time)`` where
        ``phones`` is the list of kept labels, ``durations`` holds one
        integer per kept phone (difference of the rounded
        ``time * sample_rate / hop_size`` positions of its two boundaries),
        ``start_time`` is the start of the first non-silent phone and
        ``end_time`` the end of the last non-silent phone. When the tier has
        no non-silent phone the result is ``([], [], 0, 0)``.
    """
    sr = cfg["sample_rate"]
    hop = cfg["hop_size"]
    silences = ("sil", "sp", "spn")

    labels = []
    frame_counts = []
    first_onset = 0
    last_offset = 0
    keep = 0  # number of leading entries to retain once trailing silence is cut

    for interval in tier._objects:
        onset = interval.start_time
        offset = interval.end_time
        label = interval.text
        is_silence = label in silences

        # Nothing collected yet: skip leading silence, otherwise remember
        # where the first real phone begins.
        if not labels:
            if is_silence:
                continue
            first_onset = onset

        labels.append(label)
        if not is_silence:
            # Each real phone pushes the "last useful entry" marker forward,
            # so any silence appended after it is cut off at return time.
            last_offset = offset
            keep = len(labels)

        # Round both boundaries separately before subtracting, so that
        # consecutive intervals share the same rounded boundary.
        onset_frame = np.round(onset * sr / hop)
        offset_frame = np.round(offset * sr / hop)
        frame_counts.append(int(offset_frame - onset_frame))

    return labels[:keep], frame_counts[:keep], first_onset, last_offset
| |
|
| |
|
def get_duration(utt, wav, cfg):
    """Load one utterance's TextGrid alignment and derive phone durations.

    Args:
        utt: Mapping with the keys ``"Singer"``, ``"Uid"`` and ``"Dataset"``,
            used to locate the utterance's files under ``cfg.processed_dir``.
        wav: Not used by this function; kept so existing callers keep working.
        cfg: Config object supporting both item access
            (``cfg["sample_rate"]``) and attribute access
            (``cfg.processed_dir``). It is also forwarded to
            ``get_alignment``.

    Returns:
        ``None`` when the aligned start time is not strictly before the
        aligned end time. Otherwise a tuple
        ``(duration, text, start_sample, end_sample)`` where ``duration`` is
        the per-phone duration list from ``get_alignment``, ``text`` is the
        space-joined phone sequence wrapped in braces, and the last two items
        are the start/end times multiplied by the sample rate and truncated
        to ``int``.

    Raises:
        OSError: If the ``.lab`` transcript cannot be opened (e.g. it does
            not exist).
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    text_path = os.path.join(
        cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
    )
    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # The transcript is not part of the return value, but it is still opened
    # and read so that an utterance without a .lab file fails here as before.
    # NOTE(review): assumes the .lab transcripts are UTF-8 encoded; the
    # explicit encoding avoids locale-dependent decode errors - confirm.
    with open(text_path, "r", encoding="utf-8") as f:
        f.readline()

    # Get alignments from the "phones" tier of the TextGrid.
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )

    # No usable span: there is nothing between the first and last phone.
    if start >= end:
        return None

    text = "{" + " ".join(phone) + "}"
    return duration, text, int(sample_rate * start), int(sample_rate * end)
| |
|