| |
|
|
| import argparse |
| import glob |
| import logging |
| import os |
| import sys |
|
|
| import numpy as np |
| from tqdm import tqdm |
|
|
| |
| from TTS.config import load_config |
| from TTS.tts.datasets import load_tts_samples |
| from TTS.utils.audio import AudioProcessor |
| from TTS.utils.generic_utils import ConsoleFormatter, setup_logger |
|
|
|
|
| def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]: |
| parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.") |
| parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.") |
| parser.add_argument("out_path", type=str, help="save path (directory and filename).") |
| parser.add_argument( |
| "--data_path", |
| type=str, |
| required=False, |
| help="folder including the target set of wavs overriding dataset config.", |
| ) |
| return parser.parse_known_args(arg_list) |
|
|
|
|
def main(arg_list: list[str] | None = None):
    """Compute mean/std statistics of mel and linear spectrograms and save them.

    Loads the TTS config given on the command line, extracts mel and linear
    spectrograms for every audio file, accumulates running sums, and writes a
    dict with ``mel_mean``, ``mel_std``, ``linear_mean``, ``linear_std`` and
    ``audio_config`` to the output path with ``np.save``.

    Args:
        arg_list: Command-line arguments; ``None`` means ``sys.argv[1:]``.
            Arguments not recognised by ``parse_args`` are forwarded to the
            config object as overrides.

    Raises:
        ValueError: If no spectrogram frames were accumulated (for example
            when no audio files were found).
        SystemExit: Always raised with code 0 on success, via ``sys.exit``.
    """
    setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter())
    args, overrides = parse_args(arg_list)

    config = load_config(args.config_path)
    config.parse_known_args(overrides, relaxed_parser=True)

    # Turn off normalization and any previously stored stats before building
    # the audio processor, so the features used for the statistics are not
    # themselves normalized (presumably what these flags control -- confirm
    # against AudioProcessor).
    config.audio.signal_norm = False
    config.audio.stats_path = None

    ap = AudioProcessor(**config.audio.to_dict())

    # An explicit --data_path overrides the datasets declared in the config.
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(config.datasets)[0]
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    num_frames = 0
    for item in tqdm(dataset_items):
        # Items are plain paths when globbed, dict samples otherwise.
        wav_path = item if isinstance(item, str) else item["audio_file"]
        wav = ap.load_wav(wav_path)
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # Axis 1 is reduced and counted, so it is assumed to be the frame
        # axis (features shaped (bins, frames) -- TODO confirm).
        num_frames += mel.shape[1]
        mel_sum += mel.sum(axis=1)
        linear_sum += linear.sum(axis=1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    if num_frames == 0:
        # Without this guard the divisions below fail with an opaque
        # ZeroDivisionError.
        raise ValueError("No spectrogram frames were computed; check the data path or the dataset config.")

    # Var[x] = E[x^2] - E[x]^2. Floating-point rounding can push the
    # difference slightly below zero for near-constant bins, which would turn
    # the std into NaN, so clamp at zero before taking the square root.
    mel_mean = mel_sum / num_frames
    mel_scale = np.sqrt(np.maximum(mel_square_sum / num_frames - mel_mean**2, 0.0))
    linear_mean = linear_sum / num_frames
    linear_scale = np.sqrt(np.maximum(linear_square_sum / num_frames - linear_mean**2, 0.0))

    # np.save appends ".npy" when the name lacks it; normalise the path up
    # front so the stored stats_path and the log message name the real file.
    output_file_path = args.out_path
    if not output_file_path.endswith(".npy"):
        output_file_path += ".npy"

    stats = {
        "mel_mean": mel_mean,
        "mel_std": mel_scale,
        "linear_mean": linear_mean,
        "linear_std": linear_scale,
    }

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # Store the audio config alongside the stats, pointing at the stats file
    # and with normalization re-enabled.
    config.audio.stats_path = output_file_path
    config.audio.signal_norm = True
    # These fields are removed from the saved config (presumably because they
    # do not apply once mean/std stats are used -- confirm).
    del config.audio.max_norm
    del config.audio.min_level_db
    del config.audio.symmetric_norm
    del config.audio.clip_norm
    stats["audio_config"] = config.audio.to_dict()
    # The payload is a dict, so it is saved as a pickled object array.
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")
    sys.exit(0)
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|