| | from multiprocess.pool import ThreadPool |
| | from speaker_encoder.params_data import * |
| | from speaker_encoder.config import librispeech_datasets, anglophone_nationalites |
| | from datetime import datetime |
| | from speaker_encoder import audio |
| | from pathlib import Path |
| | from tqdm import tqdm |
| | import numpy as np |
| |
|
| |
|
class DatasetLog:
    """
    Writes metadata about a dataset being created to a text file.

    The log starts with a header and a dump of the data parameters, collects numeric
    samples through add_sample(), and ends with summary statistics in finalize().
    """
    def __init__(self, root, name):
        # Slashes in the dataset name are replaced so the log lands directly in root.
        log_fname = "Log_%s.txt" % name.replace("/", "_")
        self.text_file = open(Path(root, log_fname), "w")
        # Maps a sample name to the list of every value recorded under that name.
        self.sample_data = dict()

        started_at = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("Creating dataset %s on %s" % (name, started_at))
        self.write_line("-----")
        self._log_params()

    def _log_params(self):
        """Write every attribute of params_data whose name is not dunder-prefixed."""
        from speaker_encoder import params_data
        self.write_line("Parameter values:")
        for attr_name in dir(params_data):
            if attr_name.startswith("__"):
                continue
            attr_value = getattr(params_data, attr_name)
            self.write_line("\t%s: %s" % (attr_name, attr_value))
        self.write_line("-----")

    def write_line(self, line):
        """Append one line, followed by a newline, to the log file."""
        self.text_file.write("%s\n" % line)

    def add_sample(self, **kwargs):
        """Record one value per keyword; values accumulate under the keyword's name."""
        for key, sample_value in kwargs.items():
            self.sample_data.setdefault(key, []).append(sample_value)

    def finalize(self):
        """Write min/max/mean/median of each recorded series, then close the file."""
        self.write_line("Statistics:")
        for key, samples in self.sample_data.items():
            self.write_line("\t%s:" % key)
            lowest, highest = np.min(samples), np.max(samples)
            self.write_line("\t\tmin %.3f, max %.3f" % (lowest, highest))
            average, middle = np.mean(samples), np.median(samples)
            self.write_line("\t\tmean %.3f, median %.3f" % (average, middle))
        self.write_line("-----")
        finished_at = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("Finished on %s" % finished_at)
        self.text_file.close()
| | |
| | |
def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
    """
    Locate a dataset under the datasets root and open a log for it.

    Returns the dataset directory together with a DatasetLog created in out_dir. When
    the dataset directory does not exist, a notice is printed and (None, None) is
    returned instead, so callers can test the first element to decide whether to go on.
    """
    dataset_root = datasets_root.joinpath(dataset_name)
    if dataset_root.exists():
        return dataset_root, DatasetLog(out_dir, dataset_name)

    print("Couldn\'t find %s, skipping this dataset." % dataset_root)
    return None, None
| |
|
| |
|
def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    """
    Preprocess every utterance of every speaker directory and finalize the log.

    For each speaker, an output directory named after the speaker's path relative to
    datasets_root is created in out_dir. Each audio file with the given extension is
    passed through audio.preprocess_wav and audio.wav_to_mel_spectrogram; the resulting
    frames are saved with np.save and the (output name, input path) pair is appended to
    the speaker's "_sources.txt".

    :param speaker_dirs: sequence of speaker directories (its length is reported)
    :param dataset_name: name used in printed messages and as the progress bar label
    :param datasets_root: root that speaker names are made relative to
    :param out_dir: directory receiving one sub-directory per speaker
    :param extension: audio file extension to look for, without the leading dot
    :param skip_existing: when true, files already listed in "_sources.txt" are skipped
    and new entries are appended; otherwise "_sources.txt" is rewritten
    :param logger: DatasetLog receiving one duration sample per saved utterance
    """
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a text file holding a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # A previous run may have been interrupted: collect the names it already wrote.
        # The names only matter when skipping, so the file is not read otherwise.
        existing_fnames = set()
        if skip_existing and sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable sources file means nothing is skipped.
                existing_fnames = set()

        # The context manager closes the sources file even if an utterance fails.
        with sources_fpath.open("a" if skip_existing else "w") as sources_file:
            # Gather all audio files for that speaker recursively
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                # Check if the target output file was already produced
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")
                if skip_existing and out_fname in existing_fnames:
                    continue

                # Load and preprocess the waveform, dropping empty results
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue

                # Compute the frames, discarding utterances that are too short
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue

                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)
                logger.add_sample(duration=len(wav) / sampling_rate)
                sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    # Process the utterances of each speaker through the pool; list() drains the lazy
    # imap iterator so that tqdm advances as speakers complete.
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
                  unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
| |
|
| |
|
| | |
def __preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, extension: str, skip_existing: bool):
    """
    Preprocess all utterances of one speaker and record them in "_sources.txt".

    An output directory named after the speaker's path relative to datasets_root is
    created in out_dir. Each audio file with the given extension found recursively in
    speaker_dir is passed through audio.preprocess_wav and audio.wav_to_mel_spectrogram;
    the frames are saved with np.save and the (output name, input path) pair is written
    to the sources file.

    :param speaker_dir: directory holding the speaker's audio files
    :param datasets_root: root that the speaker name is made relative to
    :param out_dir: existing directory that receives the speaker's output directory
    :param extension: audio file extension to look for, without the leading dot
    :param skip_existing: when true the sources file is appended to, otherwise rewritten
    :return: the length of the last waveform that was loaded, or 0 when no file was
    loaded at all
    """
    # Give a name to the speaker that includes its dataset
    speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

    speaker_out_dir = out_dir.joinpath(speaker_name)
    speaker_out_dir.mkdir(exist_ok=True)
    sources_fpath = speaker_out_dir.joinpath("_sources.txt")

    # No previous sources file is consulted here, so this set stays empty and the
    # skip check below never fires; skip_existing only selects the open mode.
    existing_fnames = set()

    # Defined up front so the return value exists even when the glob matches nothing.
    last_wav_len = 0

    # The context manager closes the sources file even if an utterance fails.
    with sources_fpath.open("a" if skip_existing else "w") as sources_file:
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform, dropping empty results
            wav = audio.preprocess_wav(in_fpath)
            last_wav_len = len(wav)
            if last_wav_len == 0:
                continue

            # Compute the frames, discarding utterances that are too short
            frames = audio.wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)

            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

    return last_wav_len
| |
|
def _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                                  skip_existing, logger):
    """
    Preprocess every speaker directory with a process pool and finalize the log.

    Each worker handles one speaker: it creates an output directory named after the
    speaker's path relative to datasets_root, passes every audio file with the given
    extension through audio.preprocess_wav and audio.wav_to_mel_spectrogram, saves the
    frames with np.save and records the (output name, input path) pair in the speaker's
    "_sources.txt". Workers return the lengths of the saved waveforms; the calling
    process turns them into duration samples on the logger.

    :param speaker_dirs: sequence of speaker directories (its length is reported)
    :param dataset_name: name used in printed messages
    :param datasets_root: root that speaker names are made relative to
    :param out_dir: directory receiving one sub-directory per speaker
    :param extension: audio file extension to look for, without the leading dot
    :param skip_existing: when true the sources file is appended to, otherwise rewritten
    :param logger: DatasetLog receiving one duration sample per saved utterance
    """
    from pathos.multiprocessing import ProcessingPool as Pool

    def __preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # No previous sources file is consulted here, so this stays empty and the
        # skip check below never fires; skip_existing only selects the open mode.
        existing_fnames = set()

        wav_lens = []
        # The context manager closes the sources file even if an utterance fails.
        with sources_fpath.open("a" if skip_existing else "w") as sources_file:
            for in_fpath in speaker_dir.glob("**/*.%s" % extension):
                out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
                out_fname = out_fname.replace(".%s" % extension, ".npy")
                if skip_existing and out_fname in existing_fnames:
                    continue

                # Load and preprocess the waveform, dropping empty results
                wav = audio.preprocess_wav(in_fpath)
                if len(wav) == 0:
                    continue

                # Compute the frames, discarding utterances that are too short
                frames = audio.wav_to_mel_spectrogram(wav)
                if len(frames) < partials_n_frames:
                    continue

                out_fpath = speaker_out_dir.joinpath(out_fname)
                np.save(out_fpath, frames)

                sources_file.write("%s,%s\n" % (out_fname, in_fpath))
                wav_lens.append(len(wav))
        return wav_lens

    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # The logger is only touched here, in the calling process: workers hand back plain
    # lists of waveform lengths.
    # NOTE(review): pool.map presumably returns only once all speakers are done, in
    # which case the progress lines below are printed after the work - confirm.
    pool = Pool(processes=20)
    for i, wav_lens in enumerate(pool.map(__preprocess_speaker, speaker_dirs), 1):
        for wav_len in wav_lens:
            logger.add_sample(duration=wav_len / sampling_rate)
        print(f'{i}/{len(speaker_dirs)} \r')

    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)
| |
|
| |
|
def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocess every dataset listed in librispeech_datasets["train"]["other"].

    :param datasets_root: directory containing the datasets
    :param out_dir: directory receiving the preprocessed speakers and the logs
    :param skip_existing: forwarded to the per-speaker preprocessing
    """
    for dataset_name in librispeech_datasets["train"]["other"]:
        # Initialize the preprocessing
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if not dataset_root:
            # A missing dataset is skipped, as the printed notice says; the remaining
            # datasets in the list are still processed.
            continue

        # Preprocess all speakers; "flac" is the extension searched for in this dataset
        speaker_dirs = list(dataset_root.glob("*"))
        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
                                 skip_existing, logger)
| |
|
| |
|
def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocess the VoxCeleb1 speakers that appear in "vox1_meta.csv".

    :param datasets_root: directory containing the "VoxCeleb1" dataset directory
    :param out_dir: directory receiving the preprocessed speakers and the log
    :param skip_existing: forwarded to the per-speaker preprocessing
    """
    # Initialize the preprocessing
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Read the tab-separated metadata file, dropping its header line
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        metadata = [line.split("\t") for line in metafile][1:]

    # Map each speaker id (first column) to the speaker's nationality (fourth column)
    nationalities = {line[0]: line[3] for line in metadata}

    # Every speaker in the metadata is kept: no nationality filter is applied here,
    # even though the messages below still speak of anglophone speakers. A set gives
    # constant-time membership tests when matching directories on disk.
    keep_speaker_ids = set(nationalities)
    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
          (len(keep_speaker_ids), len(nationalities)))

    # Keep only the speaker directories whose name is a known speaker id
    speaker_dirs = [speaker_dir for speaker_dir in dataset_root.joinpath("wav").glob("*")
                    if speaker_dir.name in keep_speaker_ids]
    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))

    # Preprocess all speakers
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
                             skip_existing, logger)
| |
|
| |
|
def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    """
    Preprocess every speaker directory found under "dev/aac" of the VoxCeleb2 dataset.

    Does nothing beyond the printed notice when the dataset directory is missing.

    :param datasets_root: directory containing the "VoxCeleb2" dataset directory
    :param out_dir: directory receiving the preprocessed speakers and the log
    :param skip_existing: forwarded to the per-speaker preprocessing
    """
    dataset_name = "VoxCeleb2"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if dataset_root:
        # One directory per speaker; "m4a" is the extension searched for in each.
        aac_root = dataset_root.joinpath("dev", "aac")
        speaker_dirs = list(aac_root.glob("*"))
        _preprocess_speaker_dirs_vox2(speaker_dirs, dataset_name, datasets_root, out_dir,
                                      "m4a", skip_existing, logger)
| |
|