| |
| |
| |
| |
|
|
| import logging |
| import os |
| import sys |
|
|
| import numpy as np |
| from sklearn.cluster import MiniBatchKMeans |
|
|
| import joblib |
|
|
# Configure root logging once at import time. The level can be overridden via
# the LOGLEVEL environment variable (defaults to INFO); output goes to stdout
# so it interleaves correctly with schedulers that capture stdout.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
# Module-level logger used by the functions below.
logger = logging.getLogger("learn_kmeans")
|
|
|
|
def get_km_model(
    n_clusters,
    init,
    max_iter,
    batch_size,
    tol,
    max_no_improvement,
    n_init,
    reassignment_ratio,
):
    """Build an (unfitted) MiniBatchKMeans configured for feature clustering.

    Labels are not retained (``compute_labels=False``) because only the fitted
    codebook is needed; ``verbose=1`` surfaces sklearn's per-batch progress in
    the logs. All tuning knobs are forwarded unchanged from the CLI.
    """
    model_config = dict(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        batch_size=batch_size,
        verbose=1,
        compute_labels=False,
        tol=tol,
        max_no_improvement=max_no_improvement,
        init_size=None,
        n_init=n_init,
        reassignment_ratio=reassignment_ratio,
    )
    return MiniBatchKMeans(**model_config)
|
|
|
|
def load_feature_shard(feat_dir, split, nshard, rank, percent):
    """Load one feature shard, optionally subsampling a fraction of utterances.

    Expects ``{feat_dir}/{split}_{rank}_{nshard}.npy`` (frames x dim features)
    and a sibling ``.len`` file with one per-utterance frame count per line.

    Args:
        feat_dir: directory containing the dumped shard files.
        split: dataset split name used in the file names.
        nshard: total number of shards (file-name component).
        rank: index of this shard.
        percent: fraction of utterances to sample; negative means "all"
            (returned as a read-only memory map to avoid loading into RAM).

    Returns:
        A 2-D array of frames: the whole shard (memmap) when ``percent < 0``,
        otherwise the concatenated frames of the sampled utterances.
    """
    feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
    with open(leng_path, "r") as f:
        lengs = [int(line.rstrip()) for line in f]

    if percent < 0:
        # Full shard requested: memory-map and skip the offset computation.
        return np.load(feat_path, mmap_mode="r")

    feat = np.load(feat_path, mmap_mode="r")
    nsample = int(np.ceil(len(lengs) * percent))
    if nsample == 0:
        # Previously np.concatenate([]) raised ValueError for percent == 0;
        # return an empty (0, dim) view instead.
        return feat[:0]

    # Start offset of each utterance within the flat frame array.
    offsets = [0] + np.cumsum(lengs[:-1]).tolist()
    indices = np.random.choice(len(lengs), nsample, replace=False)
    sampled_feat = np.concatenate(
        [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
    )
    logger.info(
        "sampled %d utterances, %d frames from shard %d/%d",
        nsample,
        len(sampled_feat),
        rank,
        nshard,
    )
    return sampled_feat
|
|
|
|
def load_feature(feat_dir, split, nshard, seed, percent):
    """Load and concatenate features from all shards of a split.

    Args:
        feat_dir: directory containing the dumped shard files.
        split: dataset split name.
        nshard: number of shards to load (ranks 0..nshard-1).
        seed: unused here; callers seed numpy globally before the per-shard
            subsampling in load_feature_shard happens. Kept for interface
            compatibility.
        percent: per-shard sampling fraction; negative means all frames.

    Returns:
        A single (frames, dim) array with all shards stacked along axis 0.
    """
    assert percent <= 1.0, f"percent must be <= 1.0, got {percent}"
    feat = np.concatenate(
        [
            load_feature_shard(feat_dir, split, nshard, r, percent)
            for r in range(nshard)
        ],
        axis=0,
    )
    # Use the module logger (was logging.info, which bypasses "learn_kmeans").
    logger.info("loaded feature with dimension %s", feat.shape)
    return feat
|
|
|
|
def learn_kmeans(
    feat_dir,
    split,
    nshard,
    km_path,
    n_clusters,
    seed,
    percent,
    init,
    max_iter,
    batch_size,
    tol,
    n_init,
    reassignment_ratio,
    max_no_improvement,
):
    """Fit a MiniBatchKMeans codebook on dumped features and save it.

    Loads (optionally subsampled) features from all shards, fits the model,
    dumps it to ``km_path`` via joblib, and logs the per-frame inertia of the
    fitted model on the training features.
    """
    # Seed the global numpy RNG so per-shard utterance sampling is reproducible.
    np.random.seed(seed)
    feat = load_feature(feat_dir, split, nshard, seed, percent)
    km_model = get_km_model(
        n_clusters,
        init,
        max_iter,
        batch_size,
        tol,
        max_no_improvement,
        n_init,
        reassignment_ratio,
    )
    km_model.fit(feat)
    joblib.dump(km_model, km_path)

    # score() returns negative inertia; negate and normalize per frame.
    inertia = -km_model.score(feat) / len(feat)
    # Fixed typo in the log message ("intertia" -> "inertia").
    logger.info("total inertia: %.5f", inertia)
    logger.info("finished successfully")
|
|
|
|
if __name__ == "__main__":
    import argparse

    # CLI mirrors learn_kmeans()'s signature: five positionals plus tuning flags.
    parser = argparse.ArgumentParser()
    for pos_name, pos_type in (
        ("feat_dir", str),
        ("split", str),
        ("nshard", int),
        ("km_path", str),
        ("n_clusters", int),
    ):
        parser.add_argument(pos_name, type=pos_type)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument(
        "--percent", default=-1, type=float, help="sample a subset; -1 for all"
    )
    parser.add_argument("--init", default="k-means++")
    parser.add_argument("--max_iter", default=100, type=int)
    parser.add_argument("--batch_size", default=10000, type=int)
    parser.add_argument("--tol", default=0.0, type=float)
    parser.add_argument("--max_no_improvement", default=100, type=int)
    parser.add_argument("--n_init", default=20, type=int)
    parser.add_argument("--reassignment_ratio", default=0.0, type=float)
    args = parser.parse_args()
    logging.info(str(args))

    learn_kmeans(**vars(args))
|
|