Comparative-Analysis-of-Speech-Synthesis-Models
/
TensorFlowTTS
/examples
/tacotron2
/export_align.py
| import os | |
| import shutil | |
| from tqdm import tqdm | |
| import argparse | |
| from scipy.ndimage import zoom | |
| from skimage.data import camera | |
| import numpy as np | |
| from scipy.spatial.distance import cdist | |
def safemkdir(dirn):
    """Create directory ``dirn`` if it does not already exist.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op.

    Args:
        dirn: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirn`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # isdir() followed by mkdir(), and also handles nested paths.
    os.makedirs(dirn, exist_ok=True)
| from pathlib import Path | |
def duration_to_alignment(in_duration):
    """Expand a sequence of durations into a hard 0/1 alignment matrix.

    Row ``i`` of the result holds 1.0 in ``in_duration[i]`` consecutive
    columns, starting right after the columns used by row ``i - 1``; every
    other entry is 0.0.

    Args:
        in_duration: Sequence of integer durations, one per row.

    Returns:
        A float32 array of shape ``(len(in_duration), sum(in_duration))``.
    """
    n_cols = np.sum(in_duration)
    n_rows = len(in_duration)
    attention = np.zeros(shape=(n_rows, n_cols), dtype=np.float32)

    col_start = 0
    for row, span in enumerate(in_duration):
        # Mark the whole run of columns owned by this row in one step.
        attention[row, col_start:col_start + span] = 1.0
        col_start += span

    return attention
def rescale_alignment(in_alignment, in_targcharlen):
    """Resample an alignment along its first axis and re-binarize it.

    The matrix is zoomed so that its first dimension becomes
    ``in_targcharlen`` while the second dimension is kept. Interpolated values
    below 0.5 become 0.0 and all others become 1.0.

    Args:
        in_alignment: 2-D alignment array.
        in_targcharlen: Desired size of the first axis.

    Returns:
        A tuple ``(zoomed, pivot_points)`` where ``zoomed`` is the binarized
        array with ``in_targcharlen`` rows and ``pivot_points`` is a list of
        ``(row, col)`` tuples, in row-major order, of the cells set to 1.0.
    """
    current_x = in_alignment.shape[0]
    x_ratio = in_targcharlen / current_x
    zoomed = zoom(in_alignment, (x_ratio, 1.0), mode="nearest")

    # Binarize in a single vectorized pass. The mask is the negation of
    # "< 0.5" (not ">= 0.5") so that NaN falls on the 1.0 side, exactly like
    # an element-wise ``if val < 0.5 ... else ...`` would treat it.
    active = ~(zoomed < 0.5)
    zoomed[active] = 1.0
    zoomed[~active] = 0.0

    if zoomed.shape[0] != in_targcharlen:
        print("Zooming didn't rshape well, explicitly reshaping")
        # Crop surplus rows or pad with zero rows. A fresh array is used
        # instead of ndarray.resize(), which can raise when the array is
        # referenced elsewhere.
        fixed = np.zeros(
            (in_targcharlen, in_alignment.shape[1]), dtype=zoomed.dtype
        )
        keep = min(in_targcharlen, zoomed.shape[0])
        fixed[:keep] = zoomed[:keep]
        zoomed = fixed

    # Collect pivots from the final array so they can never point at rows
    # that were cropped away by the shape correction above.
    rows, cols = np.nonzero(zoomed)
    pivot_points = list(zip(rows.tolist(), cols.tolist()))

    return zoomed, pivot_points
def gather_dist(in_mtr, in_points):
    """Compute Euclidean distances from every cell of a matrix to given points.

    Cell coordinates are enumerated in row-major order over the first two
    dimensions of ``in_mtr``.

    Args:
        in_mtr: Array whose first two dimensions define the coordinate grid.
        in_points: Sequence of ``(row, col)`` points to measure against.

    Returns:
        The ``cdist`` result: one row per grid cell (row-major) and one
        column per entry of ``in_points``.
    """
    n_rows, n_cols = in_mtr.shape[0], in_mtr.shape[1]
    cell_coords = [(row, col) for row in range(n_rows) for col in range(n_cols)]
    return cdist(cell_coords, in_points, "euclidean")
def create_guided(in_align, in_pvt, looseness):
    """Build a guided-attention map from a set of pivot points.

    Each cell receives the squared, looseness-scaled distance to its nearest
    pivot, clipped to ``[0.0, 1.0]``. Cells on a pivot get 0.0 and cells far
    from every pivot saturate at 1.0.

    Args:
        in_align: 2-D array; only its shape is used for the output grid.
        in_pvt: Sequence of ``(row, col)`` pivot points.
        looseness: Scale factor for how quickly values grow with distance.
            Lower values give a tighter map.

    Returns:
        A float32 array with the same shape as ``in_align``.
    """
    att_shape = in_align.shape

    # With no pivots every cell is infinitely far from one, so the map is
    # saturated everywhere. Returning early also avoids cdist raising on an
    # empty point list.
    if len(in_pvt) == 0:
        return np.ones(att_shape, dtype=np.float32)

    # It is dramatically faster to gather all pairwise distances at once
    # than to compute them cell by cell.
    dist_arr = gather_dist(in_align, in_pvt)

    # Scale looseness based on attention size (addition works better than
    # mul). Also divide by 100 because having the user input 3.35 is nicer.
    real_loose = (looseness / 100) * (att_shape[0] + att_shape[1])

    # Rows of dist_arr follow row-major cell order, so the per-row minimum
    # reshapes directly onto the output grid.
    nearest = dist_arr.min(axis=1)
    new_att = np.power(nearest / real_loose, 2).astype(np.float32)
    new_att = new_att.reshape(att_shape)

    return np.clip(new_att, 0.0, 1.0)
def get_pivot_points(in_att):
    """List the coordinates of all cells whose value exceeds 0.8.

    Args:
        in_att: 2-D array to scan.

    Returns:
        A list of ``(row, col)`` tuples in row-major order.
    """
    hit_rows, hit_cols = np.nonzero(in_att > 0.8)
    return list(zip(hit_rows.tolist(), hit_cols.tolist()))
def main():
    """Convert dumped duration files into guided-attention alignment files.

    For each of the ``train`` and ``valid`` subfolders of ``--dump-dir``,
    every ``.npy`` file under ``durations`` is loaded together with its
    counterpart under ``ids`` (same file name with ``-durations`` replaced by
    ``-ids``). The durations are expanded to a hard alignment, rescaled when
    the row count differs from the length of the ids array, turned into a
    guided-attention map, and saved as float32 under ``alignments`` with
    ``-ids`` replaced by ``-alignment`` in the file name.
    """
    parser = argparse.ArgumentParser(
        description="Postprocess durations to become alignments"
    )
    parser.add_argument(
        "--dump-dir",
        default="dump",
        type=str,
        help="Path of dump directory",
    )
    parser.add_argument(
        "--looseness",
        default=3.5,
        type=float,
        help="Looseness of the generated guided attention map. Lower values = tighter",
    )
    args = parser.parse_args()
    dump_dir = args.dump_dir
    dump_sets = ["train", "valid"]

    for d_set in dump_sets:
        full_fol = os.path.join(dump_dir, d_set)
        align_path = os.path.join(full_fol, "alignments")
        ids_path = os.path.join(full_fol, "ids")
        durations_path = os.path.join(full_fol, "durations")

        safemkdir(align_path)

        for duration_fn in tqdm(os.listdir(durations_path)):
            # Match on the extension rather than a substring so that names
            # such as "x.npy.bak" are not handed to np.load.
            if not duration_fn.endswith(".npy"):
                continue

            id_fn = duration_fn.replace("-durations", "-ids")
            id_path = os.path.join(ids_path, id_fn)
            duration_path = os.path.join(durations_path, duration_fn)

            duration_arr = np.load(duration_path)
            id_arr = np.load(id_path)

            id_true_size = len(id_arr)

            align = duration_to_alignment(duration_arr)

            # The alignment needs one row per id; rescale when the duration
            # count does not match.
            if align.shape[0] != id_true_size:
                align, points = rescale_alignment(align, id_true_size)
            else:
                points = get_pivot_points(align)

            if len(points) == 0:
                print("WARNING points are empty for", id_fn)

            align = create_guided(align, points, args.looseness)

            align_fn = id_fn.replace("-ids", "-alignment")
            align_full_fn = os.path.join(align_path, align_fn)

            np.save(align_full_fn, align.astype("float32"))


# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    main()