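"""Pool features along cluster boundaries.

Reads a flat feature matrix (<split>.npy) together with per-utterance frame
counts (<split>.lengths) and per-frame cluster ids (<split>.src in
--cluster-dir), merges each run of consecutive frames that share a cluster id
(mean pooling or a random sample), and writes the merged features and their
new lengths to --save-dir.

Illustrative invocation (script and directory names are placeholders):

    python merge_clusters.py /path/to/features --split train --cluster-dir /path/to/clusters --save-dir /path/to/merged --pooling mean
"""
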
import argparse
import os
import os.path as osp
import numpy as np
import tqdm
import torch
import random
from shutil import copyfile

from npy_append_array import NpyAppendArray


def get_parser():
    parser = argparse.ArgumentParser(
        description="merges consecutive features that share a cluster id and stores the pooled result in the target dir"
    )

    parser.add_argument('source', help='directory with features')
    parser.add_argument('--split', help='which split to read', required=True)
    parser.add_argument('--save-dir', help='where to save the output', required=True)
    parser.add_argument('--cluster-dir', help='where the clusters are')
    parser.add_argument('--pooling', type=str, default='mean', choices=['mean', 'sample'], help='how to pool')

    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()

    source_path = osp.join(args.source, args.split)
    cluster_path = osp.join(args.cluster_dir, args.split + ".src")
    print(f"data path: {source_path}")

    features = np.load(source_path + ".npy", mmap_mode="r")

    # <split>.lengths has one integer per utterance: its number of feature
    # frames; offsets locate each utterance within the flat feature matrix
    sizes = []
    offsets = []
    offset = 0
    with open(source_path + ".lengths", "r") as len_f:
        for line in len_f:
            length = int(line.rstrip())
            sizes.append(length)
            offsets.append(offset)
            offset += length

    # the cluster file has one line per utterance: a space-separated cluster id
    # for each feature frame
    clusters = []
    with open(cluster_path, "r") as cf:
        for line in cf:
            line = line.rstrip()
            items = line.split()
            items = list(map(int, items))
            clusters.append(items)

    os.makedirs(args.save_dir, exist_ok=True)
    save_path = osp.join(args.save_dir, args.split)

    copyfile(source_path + ".tsv", save_path + ".tsv")

    # carry over transcripts and the phone dictionary if they exist
    if os.path.exists(source_path + ".phn"):
        copyfile(source_path + ".phn", save_path + ".phn")
    if os.path.exists(osp.join(args.source, "dict.phn.txt")):
        copyfile(
            osp.join(args.source, "dict.phn.txt"),
            osp.join(args.save_dir, "dict.phn.txt"),
        )
    if os.path.exists(source_path + ".wrd"):
        copyfile(source_path + ".wrd", save_path + ".wrd")

    # pooled features are appended utterance by utterance, so start from a fresh file
    if osp.exists(save_path + ".npy"):
        os.remove(save_path + ".npy")
    npaa = NpyAppendArray(save_path + ".npy")

    def merge(feats, clust):
        feats = torch.from_numpy(feats.copy())
        clust = torch.LongTensor(clust)
        # counts holds the length of each run of identical consecutive cluster ids
        _, counts = clust.unique_consecutive(return_counts=True)
        curr = 0

        merged = []
        for c in counts:
            c = c.item()
            start = curr
            end = curr + c
            curr += c
            if args.pooling == "mean":
                # average all frames in the run
                new_x = feats[start:end].mean(dim=0)
            elif args.pooling == "sample":
                # pick one random frame from the run
                new_x = feats[start + int(random.random() * c)]
            else:
                raise NotImplementedError()
            merged.append(new_x)

        return torch.stack(merged, dim=0).numpy()

    # pool each utterance and record its new (post-merge) length
    with open(save_path + ".lengths", "w") as l_f:
        for size, offset, clust in tqdm.tqdm(
            zip(sizes, offsets, clusters), total=len(sizes)
        ):
            end = size + offset
            feats = features[offset:end]
            feats = merge(feats, clust)
            print(len(feats), file=l_f)
            npaa.append(feats)


if __name__ == "__main__":
    main()