haoxiangsnr commited on
Commit
fe777b2
·
verified ·
1 Parent(s): 7024957

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Amphion/models/base/base_trainer.py +348 -0
  2. Amphion/models/codec/ns3_codec/__pycache__/melspec.cpython-310.pyc +0 -0
  3. Amphion/models/codec/ns3_codec/__pycache__/transformer.cpython-310.pyc +0 -0
  4. Amphion/models/codec/ns3_codec/alias_free_torch/filter.py +96 -0
  5. Amphion/models/svc/diffusion/diffusion_trainer.py +102 -0
  6. Amphion/models/tta/autoencoder/__init__.py +0 -0
  7. Amphion/models/tta/autoencoder/autoencoder_loss.py +305 -0
  8. Amphion/models/tta/ldm/audioldm_inference.py +193 -0
  9. Amphion/models/tts/base/tts_inferece.py +278 -0
  10. Amphion/models/tts/fastspeech2/fs2_dataset.py +424 -0
  11. Amphion/models/tts/naturalspeech2/diffusion.py +124 -0
  12. Amphion/models/tts/vits/vits_inference.py +163 -0
  13. Amphion/models/vocoders/flow/flow_vocoder_trainer.py +0 -0
  14. Amphion/models/vocoders/gan/gan_vocoder_dataset.py +205 -0
  15. Amphion/modules/anti_aliasing/__init__.py +8 -0
  16. Amphion/modules/encoder/condition_encoder.py +244 -0
  17. Amphion/modules/general/__init__.py +3 -0
  18. Amphion/modules/monotonic_align/__init__.py +21 -0
  19. Amphion/modules/neural_source_filter/__init__.py +6 -0
  20. Amphion/modules/transformer/Layers.py +137 -0
  21. Amphion/modules/wenet_extractor/cif/predictor.py +274 -0
  22. Amphion/modules/wenet_extractor/paraformer/search/beam_search.py +479 -0
  23. Amphion/modules/wenet_extractor/paraformer/search/ctc_prefix_score.py +377 -0
  24. Amphion/modules/wenet_extractor/squeezeformer/positionwise_feed_forward.py +88 -0
  25. Amphion/modules/wenet_extractor/transformer/decoder_layer.py +140 -0
  26. Amphion/modules/wenet_extractor/transformer/subsampling.py +257 -0
  27. Amphion/modules/wenet_extractor/utils/__init__.py +0 -0
  28. Amphion/preprocessors/cdmusiceval.py +174 -0
  29. Amphion/utils/data_utils.py +588 -0
  30. Amphion/utils/distribution.py +270 -0
  31. Amphion/utils/mel.py +280 -0
  32. Amphion/utils/prompt_preparer.py +68 -0
  33. __pycache__/model.cpython-310.pyc +0 -0
  34. conf/default.yaml +70 -0
  35. exp/bmi__fa-codec/2024_05_20--16_21_26.log +4 -0
  36. exp/bmi__fa-codec/2024_05_20--16_22_35.log +110 -0
  37. exp/bmi__fa-codec/2024_05_20--16_24_01.log +4 -0
  38. exp/bmi__fa-codec/amplified_signals/S06021_L0014_HA-output.wav +0 -0
  39. exp/bmi__fa-codec/amplified_signals/S06026_L0088_HA-output.wav +0 -0
  40. exp/bmi__fa-codec/amplified_signals/S06031_L0096_HA-output.wav +0 -0
  41. exp/bmi__fa-codec/amplified_signals/S06036_L0036_HA-output.wav +0 -0
  42. exp/bmi__fa-codec/amplified_signals/S06066_L0042_HA-output.wav +0 -0
  43. exp/bmi__fa-codec/amplified_signals/S06071_L0089_HA-output.wav +0 -0
  44. exp/bmi__fa-codec/amplified_signals/S06086_L0072_HA-output.wav +0 -0
  45. exp/bmi__fa-codec/amplified_signals/S06091_L0099_HA-output.wav +0 -0
  46. exp/bmi__fa-codec/amplified_signals/S06101_L0042_HA-output.wav +0 -0
  47. exp/bmi__fa-codec/amplified_signals/S06111_L0002_HA-output.wav +0 -0
  48. exp/bmi__fa-codec/amplified_signals/S06116_L0092_HA-output.wav +0 -0
  49. exp/bmi__fa-codec/amplified_signals/S06126_L0069_HA-output.wav +0 -0
  50. exp/bmi__fa-codec/amplified_signals/S06146_L0017_HA-output.wav +0 -0
Amphion/models/base/base_trainer.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import collections
7
+ import json
8
+ import os
9
+ import sys
10
+ import time
11
+
12
+ import torch
13
+ import torch.distributed as dist
14
+ from torch.nn.parallel import DistributedDataParallel
15
+ from torch.utils.data import ConcatDataset, DataLoader
16
+ from torch.utils.tensorboard import SummaryWriter
17
+
18
+ from models.base.base_sampler import BatchSampler
19
+ from utils.util import (
20
+ Logger,
21
+ remove_older_ckpt,
22
+ save_config,
23
+ set_all_random_seed,
24
+ ValueWindow,
25
+ )
26
+
27
+
28
class BaseTrainer(object):
    """Abstract training driver shared by Amphion models.

    Wires up logging, data loading, model construction, loss, optimization,
    checkpointing and the train/eval loop. Subclasses supply the concrete
    pieces via the ``build_*`` factory hooks and the ``*_step`` methods.
    """

    def __init__(self, args, cfg):
        self.args = args
        self.log_dir = args.log_dir
        self.cfg = cfg

        self.checkpoint_dir = os.path.join(args.log_dir, "checkpoints")
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        # Only rank 0 owns the TensorBoard writer and the log file.
        if not cfg.train.ddp or args.local_rank == 0:
            self.sw = SummaryWriter(os.path.join(args.log_dir, "events"))
            self.logger = self.build_logger()
        # Sliding window over recent per-step durations (for s/step reporting).
        self.time_window = ValueWindow(50)

        self.step = 0
        self.epoch = -1  # -1 == "no epoch finished yet"; see train()
        self.max_epochs = self.cfg.train.epochs
        self.max_steps = self.cfg.train.max_steps

        # set random seed & init distributed training
        set_all_random_seed(self.cfg.train.random_seed)
        if cfg.train.ddp:
            dist.init_process_group(backend="nccl")

        # TTA models carry no singer identity; every other model type gets
        # a merged singer lookup table.
        if cfg.model_type not in ["AutoencoderKL", "AudioLDM"]:
            self.singers = self.build_singers_lut()

        # setup data_loader
        self.data_loader = self.build_data_loader()

        # setup model & enable distributed training
        self.model = self.build_model()
        print(self.model)

        if isinstance(self.model, dict):
            for key, value in self.model.items():
                value.cuda(self.args.local_rank)
                # PQMF is a fixed filter bank with no trainable parameters,
                # so it is never wrapped in DDP.
                if key == "PQMF":
                    continue
                if cfg.train.ddp:
                    self.model[key] = DistributedDataParallel(
                        value, device_ids=[self.args.local_rank]
                    )
        else:
            self.model.cuda(self.args.local_rank)
            if cfg.train.ddp:
                self.model = DistributedDataParallel(
                    self.model, device_ids=[self.args.local_rank]
                )

        # create criterion
        self.criterion = self.build_criterion()
        if isinstance(self.criterion, dict):
            for key, value in self.criterion.items():
                self.criterion[key].cuda(args.local_rank)
        else:
            self.criterion.cuda(self.args.local_rank)

        # optimizer
        self.optimizer = self.build_optimizer()
        self.scheduler = self.build_scheduler()

        # save config file
        self.config_save_path = os.path.join(self.checkpoint_dir, "args.json")

    def build_logger(self):
        """Create a file-backed logger under the checkpoint directory."""
        log_file = os.path.join(self.checkpoint_dir, "train.log")
        logger = Logger(log_file, level=self.args.log_level).logger

        return logger

    def build_dataset(self):
        """Return a ``(DatasetClass, CollatorClass)`` pair; subclass hook."""
        raise NotImplementedError

    def build_data_loader(self):
        """Build train/valid DataLoaders over the concatenation of all
        configured datasets. Returns ``{"train": ..., "valid": ...}``.
        """
        Dataset, Collator = self.build_dataset()
        # build dataset instance for each dataset and combine them by ConcatDataset
        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=False)
            datasets_list.append(subdataset)
        train_dataset = ConcatDataset(datasets_list)

        train_collate = Collator(self.cfg)
        # TODO: multi-GPU training
        if self.cfg.train.ddp:
            raise NotImplementedError("DDP is not supported yet.")

        # sampler will provide indices to batch_sampler, which will perform batching and yield batch indices
        batch_sampler = BatchSampler(
            cfg=self.cfg, concat_dataset=train_dataset, dataset_list=datasets_list
        )

        # use batch_sampler argument instead of (sampler, shuffle, drop_last, batch_size)
        train_loader = DataLoader(
            train_dataset,
            collate_fn=train_collate,
            num_workers=self.args.num_workers,
            batch_sampler=batch_sampler,
            pin_memory=False,
        )
        if not self.cfg.train.ddp or self.args.local_rank == 0:
            datasets_list = []
            for dataset in self.cfg.dataset:
                subdataset = Dataset(self.cfg, dataset, is_valid=True)
                datasets_list.append(subdataset)
            valid_dataset = ConcatDataset(datasets_list)
            valid_collate = Collator(self.cfg)
            batch_sampler = BatchSampler(
                cfg=self.cfg, concat_dataset=valid_dataset, dataset_list=datasets_list
            )
            valid_loader = DataLoader(
                valid_dataset,
                collate_fn=valid_collate,
                num_workers=1,
                batch_sampler=batch_sampler,
            )
        else:
            raise NotImplementedError("DDP is not supported yet.")
            # valid_loader = None
        data_loader = {"train": train_loader, "valid": valid_loader}
        return data_loader

    def build_singers_lut(self):
        """Merge the per-dataset singer->id tables into one LUT, assigning
        fresh ids to unseen singers, and persist it under ``log_dir``.
        """
        # combine singers
        if not os.path.exists(os.path.join(self.log_dir, self.cfg.preprocess.spk2id)):
            singers = collections.OrderedDict()
        else:
            with open(
                os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "r"
            ) as singer_file:
                singers = json.load(singer_file)
        singer_count = len(singers)
        for dataset in self.cfg.dataset:
            singer_lut_path = os.path.join(
                self.cfg.preprocess.processed_dir, dataset, self.cfg.preprocess.spk2id
            )
            # NOTE(review): `as singer_lut_path` rebinds the path variable to
            # the file handle; harmless because the path is not reused after.
            with open(singer_lut_path, "r") as singer_lut_path:
                singer_lut = json.load(singer_lut_path)
            for singer in singer_lut.keys():
                if singer not in singers:
                    singers[singer] = singer_count
                    singer_count += 1
        with open(
            os.path.join(self.log_dir, self.cfg.preprocess.spk2id), "w"
        ) as singer_file:
            json.dump(singers, singer_file, indent=4, ensure_ascii=False)
        print(
            "singers have been dumped to {}".format(
                os.path.join(self.log_dir, self.cfg.preprocess.spk2id)
            )
        )
        return singers

    def build_model(self):
        """Return the model (a Module or a dict of Modules); subclass hook."""
        raise NotImplementedError()

    def build_optimizer(self):
        raise NotImplementedError

    def build_scheduler(self):
        raise NotImplementedError()

    def build_criterion(self):
        raise NotImplementedError

    def get_state_dict(self):
        """Return everything that should go into a checkpoint; subclass hook."""
        raise NotImplementedError

    def save_config_file(self):
        """Persist the (possibly updated) config next to the checkpoints."""
        save_config(self.config_save_path, self.cfg)

    # TODO, save without module.
    def save_checkpoint(self, state_dict, saved_model_path):
        torch.save(state_dict, saved_model_path)

    def load_checkpoint(self):
        """Load the newest checkpoint named by the ``checkpoint`` index file."""
        checkpoint_path = os.path.join(self.checkpoint_dir, "checkpoint")
        assert os.path.exists(checkpoint_path)
        # Last line of the index file names the most recent checkpoint.
        # NOTE(review): the file handle here is never closed explicitly.
        checkpoint_filename = open(checkpoint_path).readlines()[-1].strip()
        model_path = os.path.join(self.checkpoint_dir, checkpoint_filename)
        assert os.path.exists(model_path)
        if not self.cfg.train.ddp or self.args.local_rank == 0:
            self.logger.info(f"Re(store) from {model_path}")
        # Load to CPU first; tensors are moved to the right device by callers.
        checkpoint = torch.load(model_path, map_location="cpu")
        return checkpoint

    def load_model(self, checkpoint):
        """Restore model/optimizer state from a checkpoint dict; subclass hook."""
        raise NotImplementedError

    def restore(self):
        """Resume training state from the latest checkpoint."""
        checkpoint = self.load_checkpoint()
        self.load_model(checkpoint)

    def train_step(self, data):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    @torch.no_grad()
    def eval_step(self):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def write_valid_summary(self, losses, stats):
        raise NotImplementedError(
            f"Need to implement function {sys._getframe().f_code.co_name} in "
            f"your sub-class of {self.__class__.__name__}. "
        )

    def echo_log(self, losses, mode="Training"):
        """Log a single line summarizing the current step's losses."""
        message = [
            "{} - Epoch {} Step {}: [{:.3f} s/step]".format(
                mode, self.epoch + 1, self.step, self.time_window.average
            )
        ]

        # Losses may be flat {name: value} or nested {group: {name: value}}.
        for key in sorted(losses.keys()):
            if isinstance(losses[key], dict):
                for k, v in losses[key].items():
                    message.append(
                        str(k).split("/")[-1] + "=" + str(round(float(v), 5))
                    )
            else:
                message.append(
                    str(key).split("/")[-1] + "=" + str(round(float(losses[key]), 5))
                )
        self.logger.info(", ".join(message))

    def eval_epoch(self):
        """Run validation over the whole valid loader.

        Returns the per-key averaged losses and the stats of the last batch.
        """
        self.logger.info("Validation...")
        valid_losses = {}
        for i, batch_data in enumerate(self.data_loader["valid"]):
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda()
            valid_loss, valid_stats, total_valid_loss = self.eval_step(batch_data, i)
            for key in valid_loss:
                if key not in valid_losses:
                    valid_losses[key] = 0
                valid_losses[key] += valid_loss[key]

        # Add mel and audio to the Tensorboard
        # Average loss
        # NOTE(review): `i` and `valid_stats` are unbound if the loader is empty.
        for key in valid_losses:
            valid_losses[key] /= i + 1
        self.echo_log(valid_losses, "Valid")
        return valid_losses, valid_stats

    def train_epoch(self):
        """One pass over the training loader: step, log, checkpoint, validate."""
        for i, batch_data in enumerate(self.data_loader["train"]):
            start_time = time.time()
            # Put the data to cuda device
            for k, v in batch_data.items():
                if isinstance(v, torch.Tensor):
                    batch_data[k] = v.cuda(self.args.local_rank)

            # Training step
            train_losses, train_stats, total_loss = self.train_step(batch_data)
            self.time_window.append(time.time() - start_time)

            # Logging / checkpointing / validation happen on rank 0 only.
            if self.args.local_rank == 0 or not self.cfg.train.ddp:
                if self.step % self.args.stdout_interval == 0:
                    self.echo_log(train_losses, "Training")

                if self.step % self.cfg.train.save_summary_steps == 0:
                    self.logger.info(f"Save summary as step {self.step}")
                    self.write_summary(train_losses, train_stats)

                if (
                    self.step % self.cfg.train.save_checkpoints_steps == 0
                    and self.step != 0
                ):
                    saved_model_name = "step-{:07d}_loss-{:.4f}.pt".format(
                        self.step, total_loss
                    )
                    saved_model_path = os.path.join(
                        self.checkpoint_dir, saved_model_name
                    )
                    saved_state_dict = self.get_state_dict()
                    self.save_checkpoint(saved_state_dict, saved_model_path)
                    self.save_config_file()
                    # keep max n models
                    remove_older_ckpt(
                        saved_model_name,
                        self.checkpoint_dir,
                        max_to_keep=self.cfg.train.keep_checkpoint_max,
                    )

                if self.step != 0 and self.step % self.cfg.train.valid_interval == 0:
                    # Switch to eval mode for validation, then back to train.
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].eval()
                    else:
                        self.model.eval()
                    # Evaluate one epoch and get average loss
                    valid_losses, valid_stats = self.eval_epoch()
                    if isinstance(self.model, dict):
                        for key in self.model.keys():
                            self.model[key].train()
                    else:
                        self.model.train()
                    # Write validation losses to summary.
                    self.write_valid_summary(valid_losses, valid_stats)
            self.step += 1

    def train(self):
        """Main loop: run epochs until ``max_epochs``, or until the global
        step exceeds ``max_steps``.
        """
        # self.epoch starts at -1 (or a restored value after restore()),
        # so a fresh run starts at epoch 0.
        for epoch in range(max(0, self.epoch), self.max_epochs):
            self.train_epoch()
            self.epoch += 1
            # Checked only at epoch boundaries, so the last epoch may overshoot.
            if self.step > self.max_steps:
                self.logger.info("Training finished!")
                break
Amphion/models/codec/ns3_codec/__pycache__/melspec.cpython-310.pyc ADDED
Binary file (2.81 kB). View file
 
Amphion/models/codec/ns3_codec/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (5.42 kB). View file
 
Amphion/models/codec/ns3_codec/alias_free_torch/filter.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import math
7
+
8
if hasattr(torch, "sinc"):
    # Recent torch ships a native normalized sinc.
    sinc = torch.sinc
else:
    # This code is adopted from adefossez's julius.core.sinc under the MIT License
    # https://adefossez.github.io/julius/julius/core.html
    def sinc(x: torch.Tensor):
        """Normalized sinc: sin(pi*x) / (pi*x), with sinc(0) == 1.

        __Warning__: unlike julius.sinc, the input is multiplied by `pi`.
        """
        one = torch.tensor(1.0, device=x.device, dtype=x.dtype)
        return torch.where(x == 0, one, torch.sin(math.pi * x) / math.pi / x)


# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
# https://adefossez.github.io/julius/julius/lowpass.html
def kaiser_sinc_filter1d(
    cutoff, half_width, kernel_size
):  # return filter [1,1,kernel_size]
    """Design a Kaiser-windowed sinc low-pass FIR kernel.

    `cutoff` and `half_width` are in normalized frequency (Nyquist = 0.5).
    Returns a tensor of shape [1, 1, kernel_size] whose taps sum to 1.
    """
    is_even = kernel_size % 2 == 0
    half_size = kernel_size // 2

    # Kaiser design: estimate stop-band attenuation A from the transition
    # band width, then derive the window's beta parameter.
    delta_f = 4 * half_width
    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
    if A > 50.0:
        beta = 0.1102 * (A - 8.7)
    elif A >= 21.0:
        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
    else:
        beta = 0.0
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
    # Sample times centered on zero; even kernels are offset by half a sample.
    if is_even:
        time = torch.arange(-half_size, half_size) + 0.5
    else:
        time = torch.arange(kernel_size) - half_size

    taps = (
        torch.zeros_like(time)
        if cutoff == 0
        else 2 * cutoff * window * sinc(2 * cutoff * time)
    )
    # Normalize to unit DC gain, otherwise a small leakage of the constant
    # component of the input signal remains.
    taps /= taps.sum()

    return taps.view(1, 1, kernel_size)
59
+
60
+
61
class LowPassFilter1d(nn.Module):
    """Channel-wise anti-aliasing low-pass FIR filter for 1D signals.

    The fixed kernel is designed by ``kaiser_sinc_filter1d`` and applied as a
    depthwise (grouped) convolution, optionally with "same"-style padding.
    """

    def __init__(
        self,
        cutoff=0.5,
        half_width=0.6,
        stride: int = 1,
        padding: bool = True,
        padding_mode: str = "replicate",
        kernel_size: int = 12,
    ):
        # kernel_size should be even number for stylegan3 setup,
        # in this implementation, odd number is also possible.
        super().__init__()
        # NOTE(review): `cutoff < -0.0` equals `< 0.0`, so cutoff == 0 passes
        # validation even though the message says it must be larger than zero.
        if cutoff < -0.0:
            raise ValueError("Minimum cutoff must be larger than zero.")
        if cutoff > 0.5:
            raise ValueError("A cutoff above 0.5 does not make sense.")
        self.kernel_size = kernel_size
        self.even = kernel_size % 2 == 0
        # Asymmetric padding for even kernels keeps the output length at T / stride.
        self.pad_left = kernel_size // 2 - int(self.even)
        self.pad_right = kernel_size // 2
        self.stride = stride
        self.padding = padding
        self.padding_mode = padding_mode
        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
        # Buffer (not Parameter): fixed taps that still follow .to()/.cuda().
        self.register_buffer("filter", filter)

    # input [B, C, T]
    def forward(self, x):
        _, C, _ = x.shape

        if self.padding:
            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
        # Depthwise conv: the same 1D kernel is applied independently per channel.
        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)

        return out
Amphion/models/svc/diffusion/diffusion_trainer.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from diffusers import DDPMScheduler
8
+
9
+ from models.svc.base import SVCTrainer
10
+ from modules.encoder.condition_encoder import ConditionEncoder
11
+ from .diffusion_wrapper import DiffusionWrapper
12
+
13
+
14
class DiffusionTrainer(SVCTrainer):
    r"""The base trainer for all diffusion models. It inherits from SVCTrainer and
    implements ``_build_model`` and ``_forward_step`` methods.
    """

    def __init__(self, args=None, cfg=None):
        SVCTrainer.__init__(self, args, cfg)

        # Only for SVC tasks using diffusion
        # DDPM noise schedule is configured entirely from the experiment config.
        self.noise_scheduler = DDPMScheduler(
            **self.cfg.model.diffusion.scheduler_settings,
        )
        self.diffusion_timesteps = (
            self.cfg.model.diffusion.scheduler_settings.num_train_timesteps
        )

    ### Following are methods only for diffusion models ###
    def _build_model(self):
        r"""Build the model for training. This function is called in ``__init__`` function."""

        # TODO: sort out the config
        # The condition encoder needs the preprocessing F0 range.
        self.cfg.model.condition_encoder.f0_min = self.cfg.preprocess.f0_min
        self.cfg.model.condition_encoder.f0_max = self.cfg.preprocess.f0_max
        self.condition_encoder = ConditionEncoder(self.cfg.model.condition_encoder)
        self.acoustic_mapper = DiffusionWrapper(self.cfg)
        model = torch.nn.ModuleList([self.condition_encoder, self.acoustic_mapper])

        num_of_params_encoder = self.count_parameters(self.condition_encoder)
        num_of_params_am = self.count_parameters(self.acoustic_mapper)
        num_of_params = num_of_params_encoder + num_of_params_am
        log = "Diffusion Model's Parameters: #Encoder is {:.2f}M, #Diffusion is {:.2f}M. The total is {:.2f}M".format(
            num_of_params_encoder / 1e6, num_of_params_am / 1e6, num_of_params / 1e6
        )
        self.logger.info(log)

        return model

    def count_parameters(self, model):
        """Total number of parameters in ``model`` (a Module or dict of Modules)."""
        model_param = 0.0
        if isinstance(model, dict):
            for key, value in model.items():
                model_param += sum(p.numel() for p in model[key].parameters())
        else:
            model_param = sum(p.numel() for p in model.parameters())
        return model_param

    def _check_nan(self, batch, loss, y_pred, y_gt):
        # Dump the entire batch before delegating, to help locate the bad sample.
        if torch.any(torch.isnan(loss)):
            for k, v in batch.items():
                self.logger.info(k)
                self.logger.info(v)

        super()._check_nan(loss, y_pred, y_gt)

    def _forward_step(self, batch):
        r"""Forward step for training and inference. This function is called
        in ``_train_step`` & ``_test_step`` function.

        Standard DDPM training: sample Gaussian noise and a random timestep
        per item, noise the mel target, and train the network to predict the
        injected noise.
        """
        device = self.accelerator.device

        if self.online_features_extraction:
            # On-the-fly features extraction
            batch = self._extract_svc_features(batch)

        # To debug
        # for k, v in batch.items():
        #     print(k, v.shape, v)
        # exit()

        mel_input = batch["mel"]
        noise = torch.randn_like(mel_input, device=device, dtype=torch.float32)
        batch_size = mel_input.size(0)
        timesteps = torch.randint(
            0,
            self.diffusion_timesteps,
            (batch_size,),
            device=device,
            dtype=torch.long,
        )

        noisy_mel = self.noise_scheduler.add_noise(mel_input, noise, timesteps)
        conditioner = self.condition_encoder(batch)

        y_pred = self.acoustic_mapper(noisy_mel, timesteps, conditioner)

        # Loss against the injected noise, masked to valid frames.
        loss = self._compute_loss(self.criterion, y_pred, noise, batch["mask"])
        self._check_nan(batch, loss, y_pred, noise)

        return loss
Amphion/models/tta/autoencoder/__init__.py ADDED
File without changes
Amphion/models/tta/autoencoder/autoencoder_loss.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import functools
9
+ import torch.nn.functional as F
10
+
11
+
12
def hinge_d_loss(logits_real, logits_fake):
    """Hinge GAN discriminator loss: mean of relu(1 - real) and relu(1 + fake)."""
    real_term = F.relu(1.0 - logits_real).mean()
    fake_term = F.relu(1.0 + logits_fake).mean()
    return 0.5 * (real_term + fake_term)
17
+
18
+
19
def vanilla_d_loss(logits_real, logits_fake):
    """Non-saturating (vanilla) GAN discriminator loss via softplus terms."""
    real_term = torch.mean(F.softplus(-logits_real))
    fake_term = torch.mean(F.softplus(logits_fake))
    return 0.5 * (real_term + fake_term)
24
+
25
+
26
def adopt_weight(weight, global_step, threshold=0, value=0.0):
    """Return `weight`, but substitute `value` until `global_step` reaches `threshold`."""
    return value if global_step < threshold else weight
30
+
31
+
32
class ActNorm(nn.Module):
    """Activation normalization (Glow-style): a per-channel affine transform
    whose shift/scale are data-initialized on the first training batch so the
    outputs start out roughly zero-mean / unit-variance.
    """

    def __init__(
        self, num_features, logdet=False, affine=True, allow_reverse_init=False
    ):
        # Only the affine variant is implemented.
        assert affine
        super().__init__()
        self.logdet = logdet
        # Shift and scale, broadcastable over (N, C, H, W).
        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
        self.allow_reverse_init = allow_reverse_init

        # 0/1 flag kept as a buffer so it persists through checkpoints.
        self.register_buffer("initialized", torch.tensor(0, dtype=torch.uint8))

    def initialize(self, input):
        """Data-dependent init: set loc/scale from per-channel batch statistics."""
        with torch.no_grad():
            # Collapse everything but the channel dim: (C, N*H*W).
            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
            mean = (
                flatten.mean(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )
            std = (
                flatten.std(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )

            self.loc.data.copy_(-mean)
            self.scale.data.copy_(1 / (std + 1e-6))

    def forward(self, input, reverse=False):
        if reverse:
            return self.reverse(input)
        # Accept 2D (N, C) input by faking H = W = 1.
        if len(input.shape) == 2:
            input = input[:, :, None, None]
            squeeze = True
        else:
            squeeze = False

        _, _, height, width = input.shape

        # Lazy data-dependent initialization on the first training forward.
        if self.training and self.initialized.item() == 0:
            self.initialize(input)
            self.initialized.fill_(1)

        h = self.scale * (input + self.loc)

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)

        if self.logdet:
            # log|det J| of the per-pixel affine map, replicated per batch item.
            log_abs = torch.log(torch.abs(self.scale))
            logdet = height * width * torch.sum(log_abs)
            logdet = logdet * torch.ones(input.shape[0]).to(input)
            return h, logdet

        return h

    def reverse(self, output):
        """Inverse transform: recover the input from a normalized output."""
        if self.training and self.initialized.item() == 0:
            if not self.allow_reverse_init:
                raise RuntimeError(
                    "Initializing ActNorm in reverse direction is "
                    "disabled by default. Use allow_reverse_init=True to enable."
                )
            else:
                self.initialize(output)
                self.initialized.fill_(1)

        if len(output.shape) == 2:
            output = output[:, :, None, None]
            squeeze = True
        else:
            squeeze = False

        h = output / self.scale - self.loc

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)
        return h
116
+
117
+
118
def weights_init(m):
    """DCGAN-style initializer: N(0, 0.02) for conv weights, N(1, 0.02) for
    norm-layer weights with zero bias. Intended for use with ``Module.apply``.
    """
    name = type(m).__name__
    if "Conv" in name:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif "BatchNorm" in name:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)
125
+
126
+
127
class NLayerDiscriminator(nn.Module):
    """Defines a PatchGAN discriminator as in Pix2Pix
    --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
    """

    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
        """Construct a PatchGAN discriminator
        Parameters:
            input_nc (int) -- the number of channels in input images
            ndf (int) -- the number of filters in the last conv layer
            n_layers (int) -- the number of conv layers in the discriminator
            norm_layer -- normalization layer
        """
        super(NLayerDiscriminator, self).__init__()
        # BatchNorm by default; ActNorm when requested (e.g. for small batches).
        if not use_actnorm:
            norm_layer = nn.BatchNorm2d
        else:
            norm_layer = ActNorm
        if (
            type(norm_layer) == functools.partial
        ):  # no need to use bias as BatchNorm2d has affine parameters
            use_bias = norm_layer.func != nn.BatchNorm2d
        else:
            use_bias = norm_layer != nn.BatchNorm2d

        # 4x4 kernels with padding 1, the classic PatchGAN configuration.
        kw = 4
        padw = 1
        sequence = [
            nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
            nn.LeakyReLU(0.2, True),
        ]
        nf_mult = 1
        nf_mult_prev = 1
        for n in range(1, n_layers):  # gradually increase the number of filters
            nf_mult_prev = nf_mult
            nf_mult = min(2**n, 8)  # filter count doubles per layer, capped at 8x
            sequence += [
                nn.Conv2d(
                    ndf * nf_mult_prev,
                    ndf * nf_mult,
                    kernel_size=kw,
                    stride=2,
                    padding=padw,
                    bias=use_bias,
                ),
                norm_layer(ndf * nf_mult),
                nn.LeakyReLU(0.2, True),
            ]

        # One extra stride-1 conv block before the prediction head.
        nf_mult_prev = nf_mult
        nf_mult = min(2**n_layers, 8)
        sequence += [
            nn.Conv2d(
                ndf * nf_mult_prev,
                ndf * nf_mult,
                kernel_size=kw,
                stride=1,
                padding=padw,
                bias=use_bias,
            ),
            norm_layer(ndf * nf_mult),
            nn.LeakyReLU(0.2, True),
        ]

        sequence += [
            nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)
        ]  # output 1 channel prediction map
        self.main = nn.Sequential(*sequence)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)
199
+
200
+
201
class AutoencoderLossWithDiscriminator(nn.Module):
    """VAE-GAN loss (LDM-style): L1 reconstruction with a learnable log-variance,
    a KL term against the encoder posterior, and an adversarial term from a
    PatchGAN discriminator with an adaptively balanced weight.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.kl_weight = cfg.kl_weight
        # Learnable global log-variance of the reconstruction likelihood.
        self.logvar = nn.Parameter(torch.ones(size=()) * cfg.logvar_init)

        self.discriminator = NLayerDiscriminator(
            input_nc=cfg.disc_in_channels,
            n_layers=cfg.disc_num_layers,
            use_actnorm=cfg.use_actnorm,
        ).apply(weights_init)

        # The adversarial term is disabled until this global step.
        self.discriminator_iter_start = cfg.disc_start
        self.discriminator_weight = cfg.disc_weight
        self.disc_factor = cfg.disc_factor
        self.disc_loss = hinge_d_loss

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer):
        """Balance generator vs. reconstruction gradients at ``last_layer``
        by the ratio of their gradient norms, then clamp and rescale.
        """
        nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
        g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(
            d_weight, self.cfg.min_adapt_d_weight, self.cfg.max_adapt_d_weight
        ).detach()
        d_weight = d_weight * self.discriminator_weight
        return d_weight

    def forward(
        self,
        inputs,
        reconstructions,
        posteriors,
        optimizer_idx,
        global_step,
        last_layer,
        split="train",
        weights=None,
    ):
        """Compute the generator losses (optimizer_idx == 0) or the
        discriminator loss (optimizer_idx == 1) as a dict of tensors.

        NOTE(review): `posteriors` is assumed to expose a ``.kl()`` method
        (a diagonal-Gaussian posterior) — confirm against the autoencoder.
        Any other `optimizer_idx` value falls through and returns None.
        """
        rec_loss = torch.abs(
            inputs.contiguous() - reconstructions.contiguous()
        )  # l1 loss
        # Gaussian NLL with a single learnable log-variance.
        nll_loss = rec_loss / torch.exp(self.logvar) + self.logvar
        weighted_nll_loss = nll_loss
        if weights is not None:
            weighted_nll_loss = weights * nll_loss
        # weighted_nll_loss = torch.sum(weighted_nll_loss) / weighted_nll_loss.shape[0]
        weighted_nll_loss = torch.mean(weighted_nll_loss)
        # nll_loss = torch.sum(nll_loss) / nll_loss.shape[0]
        nll_loss = torch.mean(nll_loss)
        kl_loss = posteriors.kl()
        kl_loss = torch.sum(kl_loss) / kl_loss.shape[0]
        # ? kl_loss = torch.mean(kl_loss)

        # now the GAN part
        if optimizer_idx == 0:
            # Generator update: fool the discriminator on reconstructions.
            logits_fake = self.discriminator(reconstructions.contiguous())
            g_loss = -torch.mean(logits_fake)

            if self.disc_factor > 0.0:
                try:
                    d_weight = self.calculate_adaptive_weight(
                        nll_loss, g_loss, last_layer=last_layer
                    )
                except RuntimeError:
                    # autograd.grad fails in eval/no-grad mode; only tolerated there.
                    assert not self.training
                    d_weight = torch.tensor(0.0)
            else:
                d_weight = torch.tensor(0.0)

            # Zero out the adversarial term before discriminator warm-up starts.
            disc_factor = adopt_weight(
                self.disc_factor, global_step, threshold=self.discriminator_iter_start
            )

            total_loss = (
                weighted_nll_loss
                + self.kl_weight * kl_loss
                + d_weight * disc_factor * g_loss
            )

            return {
                "loss": total_loss,
                "kl_loss": kl_loss,
                "rec_loss": rec_loss.mean(),
                "nll_loss": nll_loss,
                "g_loss": g_loss,
                "d_weight": d_weight,
                "disc_factor": torch.tensor(disc_factor),
            }

        if optimizer_idx == 1:
            # Discriminator update: both sides detached from the generator graph.
            logits_real = self.discriminator(inputs.contiguous().detach())
            logits_fake = self.discriminator(reconstructions.contiguous().detach())

            disc_factor = adopt_weight(
                self.disc_factor, global_step, threshold=self.discriminator_iter_start
            )
            d_loss = disc_factor * self.disc_loss(logits_real, logits_fake)

            return {
                "d_loss": d_loss,
                "logits_real": logits_real.mean(),
                "logits_fake": logits_fake.mean(),
            }
Amphion/models/tta/ldm/audioldm_inference.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import time
8
+ import numpy as np
9
+ import torch
10
+ from tqdm import tqdm
11
+ import torch.nn as nn
12
+ from collections import OrderedDict
13
+ import json
14
+
15
+ from models.tta.autoencoder.autoencoder import AutoencoderKL
16
+ from models.tta.ldm.inference_utils.vocoder import Generator
17
+ from models.tta.ldm.audioldm import AudioLDM
18
+ from transformers import T5EncoderModel, AutoTokenizer
19
+ from diffusers import PNDMScheduler
20
+
21
+ import matplotlib.pyplot as plt
22
+ from scipy.io.wavfile import write
23
+
24
+
25
class AttrDict(dict):
    """Dict whose entries are also reachable as attributes (h.key == h["key"])."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Alias the attribute namespace to the dict itself so that reads and
        # writes through either interface stay in sync.
        self.__dict__ = self
29
+
30
+
31
class AudioLDMInference:
    """Text-to-audio inference pipeline for AudioLDM.

    Wires together a frozen T5 text encoder (prompt conditioning), the
    AudioLDM latent diffusion model, a frozen KL autoencoder (latents -> mel
    spectrogram), and a HiFi-GAN-style vocoder (mel -> waveform).
    Outputs are written under ``args.output_dir`` as ``mel/<text>.png`` and
    ``wav/<text>.wav``.
    """

    def __init__(self, args, cfg):
        self.cfg = cfg
        self.args = args

        # Frozen sub-modules first; the diffusion model itself is built and
        # restored from args.checkpoint_path afterwards.
        self.build_autoencoderkl()
        self.build_textencoder()

        self.model = self.build_model()
        self.load_state_dict()

        self.build_vocoder()

        self.out_path = self.args.output_dir
        self.out_mel_path = os.path.join(self.out_path, "mel")
        self.out_wav_path = os.path.join(self.out_path, "wav")
        os.makedirs(self.out_mel_path, exist_ok=True)
        os.makedirs(self.out_wav_path, exist_ok=True)

    def build_autoencoderkl(self):
        """Load the pretrained KL autoencoder and freeze it in eval mode."""
        self.autoencoderkl = AutoencoderKL(self.cfg.model.autoencoderkl)
        self.autoencoder_path = self.cfg.model.autoencoder_path
        checkpoint = torch.load(self.autoencoder_path, map_location="cpu")
        self.autoencoderkl.load_state_dict(checkpoint["model"])
        self.autoencoderkl.cuda(self.args.local_rank)
        self.autoencoderkl.requires_grad_(requires_grad=False)
        self.autoencoderkl.eval()

    def build_textencoder(self):
        """Load the frozen T5 tokenizer/encoder used for prompt embedding."""
        self.tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)
        self.text_encoder = T5EncoderModel.from_pretrained("t5-base")
        self.text_encoder.cuda(self.args.local_rank)
        self.text_encoder.requires_grad_(requires_grad=False)
        self.text_encoder.eval()

    def build_vocoder(self):
        """Build the mel-to-waveform vocoder from its JSON config and checkpoint."""
        config_file = os.path.join(self.args.vocoder_config_path)
        with open(config_file) as f:
            data = f.read()
        json_config = json.loads(data)
        h = AttrDict(json_config)
        self.vocoder = Generator(h).to(self.args.local_rank)
        checkpoint_dict = torch.load(
            self.args.vocoder_path, map_location=self.args.local_rank
        )
        self.vocoder.load_state_dict(checkpoint_dict["generator"])

    def build_model(self):
        """Instantiate the latent diffusion model (weights loaded separately)."""
        self.model = AudioLDM(self.cfg.model.audioldm)
        return self.model

    def load_state_dict(self):
        """Restore diffusion-model weights from args.checkpoint_path, move to GPU."""
        self.checkpoint_path = self.args.checkpoint_path
        checkpoint = torch.load(self.checkpoint_path, map_location="cpu")
        self.model.load_state_dict(checkpoint["model"])
        self.model.cuda(self.args.local_rank)

    def get_text_embedding(self):
        """Encode the prompt and an empty (unconditional) prompt with T5.

        Returns a tensor whose first batch row is the unconditional embedding
        and whose second is the conditional one, as expected by
        classifier-free guidance.
        """
        text = self.args.text

        prompt = [text]

        text_input = self.tokenizer(
            prompt,
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            padding="do_not_pad",
            return_tensors="pt",
        )
        text_embeddings = self.text_encoder(
            text_input.input_ids.to(self.args.local_rank)
        )[0]

        # Pad the empty prompt to the same token length so the two batches
        # can be concatenated.
        max_length = text_input.input_ids.shape[-1]
        uncond_input = self.tokenizer(
            [""] * 1, padding="max_length", max_length=max_length, return_tensors="pt"
        )
        uncond_embeddings = self.text_encoder(
            uncond_input.input_ids.to(self.args.local_rank)
        )[0]
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        return text_embeddings

    def inference(self):
        """Run the PNDM sampling loop and write the mel PNG and 16 kHz wav."""
        text_embeddings = self.get_text_embedding()
        print(text_embeddings.shape)

        num_steps = self.args.num_steps
        guidance_scale = self.args.guidance_scale

        noise_scheduler = PNDMScheduler(
            num_train_timesteps=1000,
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            skip_prk_steps=True,
            set_alpha_to_one=False,
            steps_offset=1,
            prediction_type="epsilon",
        )

        noise_scheduler.set_timesteps(num_steps)

        # Latent spatial size: an 80x624 mel divided by the autoencoder's
        # total downsampling factor 2**(len(ch_mult)-1).
        latents = torch.randn(
            (
                1,
                self.cfg.model.autoencoderkl.z_channels,
                80 // (2 ** (len(self.cfg.model.autoencoderkl.ch_mult) - 1)),
                624 // (2 ** (len(self.cfg.model.autoencoderkl.ch_mult) - 1)),
            )
        ).to(self.args.local_rank)

        self.model.eval()
        for t in tqdm(noise_scheduler.timesteps):
            t = t.to(self.args.local_rank)

            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
            latent_model_input = torch.cat([latents] * 2)

            latent_model_input = noise_scheduler.scale_model_input(
                latent_model_input, timestep=t
            )
            # print(latent_model_input.shape)

            # predict the noise residual
            with torch.no_grad():
                noise_pred = self.model(
                    latent_model_input, torch.cat([t.unsqueeze(0)] * 2), text_embeddings
                )

            # perform guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )

            # compute the previous noisy sample x_t -> x_t-1
            latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
            # print(latents.shape)

        latents_out = latents
        print(latents_out.shape)

        # Decode latents back to a mel spectrogram with the frozen autoencoder.
        with torch.no_grad():
            mel_out = self.autoencoderkl.decode(latents_out)
        print(mel_out.shape)

        melspec = mel_out[0, 0].cpu().detach().numpy()
        plt.imsave(os.path.join(self.out_mel_path, self.args.text + ".png"), melspec)

        self.vocoder.eval()
        self.vocoder.remove_weight_norm()
        with torch.no_grad():
            melspec = np.expand_dims(melspec, 0)
            melspec = torch.FloatTensor(melspec).to(self.args.local_rank)

            y = self.vocoder(melspec)
            # Scale [-1, 1] float audio to 16-bit PCM range.
            audio = y.squeeze()
            audio = audio * 32768.0
            audio = audio.cpu().numpy().astype("int16")

        write(os.path.join(self.out_wav_path, self.args.text + ".wav"), 16000, audio)
Amphion/models/tts/base/tts_inferece.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import time
9
+ import accelerate
10
+ import random
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+ from accelerate.logging import get_logger
14
+ from torch.utils.data import DataLoader
15
+ from safetensors.torch import load_file
16
+
17
+
18
+ from abc import abstractmethod
19
+ from pathlib import Path
20
+ from utils.io import save_audio
21
+ from utils.util import load_config
22
+ from models.vocoders.vocoder_inference import synthesis
23
+
24
+
25
class TTSInference(object):
    """Base class for TTS inference.

    Handles the shared plumbing: accelerate setup, logging, random seeding,
    optional batch dataloader construction, model building, and checkpoint
    restoration. Subclasses implement ``_build_model``,
    ``_build_test_dataset``, ``_inference_each_batch`` and
    ``inference_for_single_utterance``.
    """

    def __init__(self, args=None, cfg=None):
        super().__init__()

        start = time.monotonic_ns()
        self.args = args
        self.cfg = cfg
        # "single" (one utterance from CLI text) or "batch" (a test set).
        self.infer_type = args.mode

        # get exp_dir
        if self.args.acoustics_dir is not None:
            self.exp_dir = self.args.acoustics_dir
        elif self.args.checkpoint_path is not None:
            self.exp_dir = os.path.dirname(os.path.dirname(self.args.checkpoint_path))

        # Init accelerator
        self.accelerator = accelerate.Accelerator()
        self.accelerator.wait_for_everyone()
        self.device = self.accelerator.device

        # Get logger
        with self.accelerator.main_process_first():
            self.logger = get_logger("inference", log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")

        self.acoustic_model_dir = args.acoustics_dir
        self.logger.debug(f"Acoustic model dir: {args.acoustics_dir}")

        if args.vocoder_dir is not None:
            self.vocoder_dir = args.vocoder_dir
            self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")

        os.makedirs(args.output_dir, exist_ok=True)

        # Set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # Setup data loader (batch mode only)
        if self.infer_type == "batch":
            with self.accelerator.main_process_first():
                self.logger.info("Building dataset...")
                start = time.monotonic_ns()
                self.test_dataloader = self._build_test_dataloader()
                end = time.monotonic_ns()
                self.logger.info(
                    f"Building dataset done in {(end - start) / 1e6:.2f}ms"
                )

        # Build model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            self.logger.info(f"Building model done in {(end - start) / 1e6:.3f}ms")

        # Init with accelerate
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        self.accelerator = accelerate.Accelerator()
        self.model = self.accelerator.prepare(self.model)
        if self.infer_type == "batch":
            self.test_dataloader = self.accelerator.prepare(self.test_dataloader)
        end = time.monotonic_ns()
        self.accelerator.wait_for_everyone()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.3f}ms")

        with self.accelerator.main_process_first():
            self.logger.info("Loading checkpoint...")
            start = time.monotonic_ns()
            if args.acoustics_dir is not None:
                self._load_model(
                    checkpoint_dir=os.path.join(args.acoustics_dir, "checkpoint")
                )
            elif args.checkpoint_path is not None:
                self._load_model(checkpoint_path=args.checkpoint_path)
            else:
                # NOTE(review): this only prints and continues with random
                # weights — confirm whether it should raise instead.
                print("Either checkpoint dir or checkpoint path should be provided.")

            end = time.monotonic_ns()
            self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.3f}ms")

        self.model.eval()
        self.accelerator.wait_for_everyone()

    def _build_test_dataset(self):
        # Subclass hook: return (DatasetClass, CollatorClass) for batch mode.
        pass

    def _build_model(self):
        # Subclass hook: return the acoustic model instance.
        pass

    # TODO: LEGACY CODE
    def _build_test_dataloader(self):
        """Build the batch-mode DataLoader from the subclass dataset/collator."""
        datasets, collate = self._build_test_dataset()
        self.test_dataset = datasets(self.args, self.cfg)
        self.test_collate = collate(self.cfg)
        # Never ask for more items per batch than the test set contains.
        self.test_batch_size = min(
            self.cfg.train.batch_size, len(self.test_dataset.metadata)
        )
        test_dataloader = DataLoader(
            self.test_dataset,
            collate_fn=self.test_collate,
            num_workers=1,
            batch_size=self.test_batch_size,
            shuffle=False,
        )
        return test_dataloader

    def _load_model(
        self,
        checkpoint_dir: str = None,
        checkpoint_path: str = None,
        old_mode: bool = False,
    ):
        r"""Load model from checkpoint. If checkpoint_path is None, it will
        load the latest checkpoint in checkpoint_dir. If checkpoint_path is not
        None, it will load the checkpoint specified by checkpoint_path. **Only use this
        method after** ``accelerator.prepare()``.
        """

        if checkpoint_path is None:
            assert checkpoint_dir is not None
            # Load the latest accelerator state dicts
            ls = [
                str(i) for i in Path(checkpoint_dir).glob("*") if not "audio" in str(i)
            ]
            # Checkpoint dirs embed the step number; sort newest-first by it.
            ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
            checkpoint_path = ls[0]

        if (
            Path(os.path.join(checkpoint_path, "model.safetensors")).exists()
            and accelerate.__version__ < "0.25"
        ):
            # Older accelerate cannot restore safetensors states itself; read
            # the weights file directly.
            self.model.load_state_dict(
                load_file(os.path.join(checkpoint_path, "model.safetensors")),
                strict=False,
            )
        else:
            self.accelerator.load_state(str(checkpoint_path))
        return str(checkpoint_path)

    def inference(self):
        """Dispatch to single-utterance or batch synthesis and write wavs."""
        if self.infer_type == "single":
            out_dir = os.path.join(self.args.output_dir, "single")
            os.makedirs(out_dir, exist_ok=True)

            pred_audio = self.inference_for_single_utterance()
            save_path = os.path.join(out_dir, "test_pred.wav")
            save_audio(save_path, pred_audio, self.cfg.preprocess.sample_rate)

        elif self.infer_type == "batch":
            out_dir = os.path.join(self.args.output_dir, "batch")
            os.makedirs(out_dir, exist_ok=True)

            pred_audio_list = self.inference_for_batches()
            for it, wav in zip(self.test_dataset.metadata, pred_audio_list):
                uid = it["Uid"]
                save_audio(
                    os.path.join(out_dir, f"{uid}.wav"),
                    wav.numpy(),
                    self.cfg.preprocess.sample_rate,
                    add_silence=True,
                    turn_up=True,
                )
                # NOTE(review): inference_for_batches writes the .pt files to
                # args.output_dir, not out_dir — this cleanup may be a no-op;
                # verify the intended path.
                tmp_file = os.path.join(out_dir, f"{uid}.pt")
                if os.path.exists(tmp_file):
                    os.remove(tmp_file)
            print("Saved to: ", out_dir)

    @torch.inference_mode()
    def inference_for_batches(self):
        """Synthesize mels for every test batch, then vocode them to audio.

        NOTE(review): this method has no return statement, yet ``inference``
        iterates over its result — confirm whether it should return the
        synthesized waveforms (``res``).
        """
        y_pred = []
        for i, batch in tqdm(enumerate(self.test_dataloader)):
            y_pred, mel_lens, _ = self._inference_each_batch(batch)
            y_ls = y_pred.chunk(self.test_batch_size)
            tgt_ls = mel_lens.chunk(self.test_batch_size)
            j = 0
            for it, l in zip(y_ls, tgt_ls):
                l = l.item()
                # Trim padding frames and move to CPU before dumping to disk.
                it = it.squeeze(0)[:l].detach().cpu()

                uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
                torch.save(it, os.path.join(self.args.output_dir, f"{uid}.pt"))
                j += 1

        vocoder_cfg, vocoder_ckpt = self._parse_vocoder(self.args.vocoder_dir)
        res = synthesis(
            cfg=vocoder_cfg,
            vocoder_weight_file=vocoder_ckpt,
            n_samples=None,
            pred=[
                torch.load(
                    os.path.join(self.args.output_dir, "{}.pt".format(item["Uid"]))
                ).numpy()
                for item in self.test_dataset.metadata
            ],
        )
        for it, wav in zip(self.test_dataset.metadata, res):
            uid = it["Uid"]
            save_audio(
                os.path.join(self.args.output_dir, f"{uid}.wav"),
                wav.numpy(),
                22050,
                add_silence=True,
                turn_up=True,
            )

    @abstractmethod
    @torch.inference_mode()
    def _inference_each_batch(self, batch_data):
        # Subclass hook: return (mels, mel_lens, ...) for one batch.
        pass

    def inference_for_single_utterance(self, text):
        # Subclass hook: synthesize one utterance and return the waveform.
        pass

    def synthesis_by_vocoder(self, pred):
        """Vocode predicted mels with the vocoder referenced on this instance.

        NOTE(review): relies on ``self.vocoder_cfg`` and
        ``self.checkpoint_dir_vocoder``, which are not assigned in this class
        — confirm subclasses set them before calling.
        """
        audios_pred = synthesis(
            self.vocoder_cfg,
            self.checkpoint_dir_vocoder,
            len(pred),
            pred,
        )

        return audios_pred

    @staticmethod
    def _parse_vocoder(vocoder_dir):
        r"""Parse vocoder config"""
        vocoder_dir = os.path.abspath(vocoder_dir)
        ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
        # Checkpoint files are named by step number; pick the newest.
        ckpt_list.sort(key=lambda x: int(x.stem), reverse=True)
        ckpt_path = str(ckpt_list[0])
        vocoder_cfg = load_config(
            os.path.join(vocoder_dir, "args.json"), lowercase=True
        )
        return vocoder_cfg, ckpt_path

    def _set_random_seed(self, seed):
        """Set random seed for all possible random modules."""
        random.seed(seed)
        np.random.seed(seed)
        torch.random.manual_seed(seed)
Amphion/models/tts/fastspeech2/fs2_dataset.py ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import random
7
+ import torch
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from utils.data_utils import *
10
+ from models.base.base_dataset import (
11
+ BaseOfflineCollator,
12
+ BaseOfflineDataset,
13
+ BaseTestDataset,
14
+ BaseTestCollator,
15
+ )
16
+ from text import text_to_sequence
17
+
18
+
19
class FS2Dataset(BaseOfflineDataset):
    """FastSpeech 2 training dataset.

    Extends BaseOfflineDataset with per-utterance durations, pitch and energy
    (frame- or phone-level), phone-label text, and an optional speaker map.
    Utterances whose mel or duration files are missing are dropped.
    """

    def __init__(self, cfg, dataset, is_valid=False):
        BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid)
        self.batch_size = cfg.train.batch_size
        cfg = cfg.preprocess
        # utt2duration: "<dataset>_<uid>" -> path of the duration .npy file
        self.utt2duration_path = {}
        for utt_info in self.metadata:
            dataset = utt_info["Dataset"]
            uid = utt_info["Uid"]
            utt = "{}_{}".format(dataset, uid)

            self.utt2duration_path[utt] = os.path.join(
                cfg.processed_dir,
                dataset,
                cfg.duration_dir,
                uid + ".npy",
            )
        self.utt2dur = self.read_duration()

        if cfg.use_frame_energy:
            self.frame_utt2energy, self.energy_statistic = load_energy(
                self.metadata,
                cfg.processed_dir,
                cfg.energy_dir,
                use_log_scale=cfg.use_log_scale_energy,
                # BUGFIX: was `self.preprocess.utt2spk` — no such attribute;
                # every sibling branch uses `self.utt2spk`.
                utt2spk=self.utt2spk if cfg.use_spkid else None,
                return_norm=True,
            )
        elif cfg.use_phone_energy:
            self.phone_utt2energy, self.energy_statistic = load_energy(
                self.metadata,
                cfg.processed_dir,
                cfg.phone_energy_dir,
                use_log_scale=cfg.use_log_scale_energy,
                utt2spk=self.utt2spk if cfg.use_spkid else None,
                return_norm=True,
            )

        if cfg.use_frame_pitch:
            self.frame_utt2pitch, self.pitch_statistic = load_energy(
                self.metadata,
                cfg.processed_dir,
                cfg.pitch_dir,
                # BUGFIX: was `use_log_scale=cfg.energy_extract_mode`, which
                # is not a log-scale flag; mirror the phone-pitch branch.
                use_log_scale=cfg.use_log_scale_pitch,
                utt2spk=self.utt2spk if cfg.use_spkid else None,
                return_norm=True,
            )

        elif cfg.use_phone_pitch:
            self.phone_utt2pitch, self.pitch_statistic = load_energy(
                self.metadata,
                cfg.processed_dir,
                cfg.phone_pitch_dir,
                use_log_scale=cfg.use_log_scale_pitch,
                utt2spk=self.utt2spk if cfg.use_spkid else None,
                return_norm=True,
            )

        # utt2lab: "<dataset>_<uid>" -> path of the phone-label .txt file
        self.utt2lab_path = {}
        for utt_info in self.metadata:
            dataset = utt_info["Dataset"]
            uid = utt_info["Uid"]
            utt = "{}_{}".format(dataset, uid)

            self.utt2lab_path[utt] = os.path.join(
                cfg.processed_dir,
                dataset,
                cfg.lab_dir,
                uid + ".txt",
            )

        self.speaker_map = {}
        spk2id_path = os.path.join(cfg.processed_dir, "spk2id.json")
        if os.path.exists(spk2id_path):
            # BUGFIX: the original called open(os.path.exists(path)), passing
            # a bool to open(), which raised whenever the file existed.
            with open(spk2id_path) as f:
                self.speaker_map = json.load(f)

        self.metadata = self.check_metadata()

    def __getitem__(self, index):
        """Return one training example (mel features from the base class plus
        durations, phone ids, pitch/energy, and speaker id)."""
        single_feature = BaseOfflineDataset.__getitem__(self, index)

        utt_info = self.metadata[index]
        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        duration = self.utt2dur[utt]

        # text: one line of phones per utterance
        with open(self.utt2lab_path[utt], "r") as f:
            phones = f.readlines()[0].strip()
        # todo: add cleaner(chenxi)
        phones_ids = np.array(text_to_sequence(phones, ["english_cleaners"]))
        text_len = len(phones_ids)

        if self.cfg.preprocess.use_frame_pitch:
            pitch = self.frame_utt2pitch[utt]
        elif self.cfg.preprocess.use_phone_pitch:
            pitch = self.phone_utt2pitch[utt]

        if self.cfg.preprocess.use_frame_energy:
            energy = self.frame_utt2energy[utt]
        elif self.cfg.preprocess.use_phone_energy:
            energy = self.phone_utt2energy[utt]

        # speaker: fall back to id 0 when no speaker map was loaded
        if len(self.speaker_map) > 0:
            speaker_id = self.speaker_map[utt_info["Singer"]]
        else:
            speaker_id = 0

        single_feature.update(
            {
                "durations": duration,
                "texts": phones_ids,
                "spk_id": speaker_id,
                "text_len": text_len,
                "pitch": pitch,
                "energy": energy,
                "uid": uid,
            }
        )
        return self.clip_if_too_long(single_feature)

    def read_duration(self):
        """Read per-utterance durations, skipping utterances with missing
        files, and sanity-check that mel length == sum(duration)."""
        utt2dur = {}
        for index in range(len(self.metadata)):
            utt_info = self.metadata[index]
            dataset = utt_info["Dataset"]
            uid = utt_info["Uid"]
            utt = "{}_{}".format(dataset, uid)

            if not os.path.exists(self.utt2mel_path[utt]) or not os.path.exists(
                self.utt2duration_path[utt]
            ):
                continue

            mel = np.load(self.utt2mel_path[utt]).transpose(1, 0)
            duration = np.load(self.utt2duration_path[utt])
            assert mel.shape[0] == sum(
                duration
            ), f"{utt}: mismatch length between mel {mel.shape[0]} and sum(duration) {sum(duration)}"
            utt2dur[utt] = duration
        return utt2dur

    def __len__(self):
        return len(self.metadata)

    def random_select(self, feature_seq_len, max_seq_len, ending_ts=2812):
        """
        ending_ts: to avoid invalid whisper features for over 30s audios
        2812 = 30 * 24000 // 256
        """
        ts = max(feature_seq_len - max_seq_len, 0)
        # Clamp at 0 so random.randint never receives a negative upper bound
        # (possible when ending_ts < max_seq_len).
        ts = max(min(ts, ending_ts - max_seq_len), 0)

        start = random.randint(0, ts)
        end = start + max_seq_len
        return start, end

    def clip_if_too_long(self, sample, max_seq_len=1000):
        """
        sample :
        {
            'spk_id': (1,),
            'target_len': int
            'mel': (seq_len, dim),
            'frame_pitch': (seq_len,)
            'frame_energy': (seq_len,)
            'content_vector_feat': (seq_len, dim)
        }
        """
        if sample["target_len"] <= max_seq_len:
            return sample

        start, end = self.random_select(sample["target_len"], max_seq_len)
        sample["target_len"] = end - start

        # BUGFIX: also exclude the scalar "text_len" and the string "uid" —
        # slicing an int raised TypeError for any over-long utterance.
        # NOTE(review): phone-level entries ("texts", "durations") are still
        # sliced with frame indices; confirm that is intended.
        for k in sample.keys():
            if k not in ["spk_id", "target_len", "text_len", "uid"]:
                sample[k] = sample[k][start:end]

        return sample

    def check_metadata(self):
        """Drop utterances whose duration or mel files are missing on disk."""
        new_metadata = []
        for utt_info in self.metadata:
            dataset = utt_info["Dataset"]
            uid = utt_info["Uid"]
            utt = "{}_{}".format(dataset, uid)
            if not os.path.exists(self.utt2duration_path[utt]) or not os.path.exists(
                self.utt2mel_path[utt]
            ):
                continue
            else:
                new_metadata.append(utt_info)
        return new_metadata
222
+
223
+
224
class FS2Collator(BaseOfflineCollator):
    """Zero-pads model inputs and targets based on number of frames per step."""

    def __init__(self, cfg):
        BaseOfflineCollator.__init__(self, cfg)
        self.sort = cfg.train.sort_sample
        self.batch_size = cfg.train.batch_size
        self.drop_last = cfg.train.drop_last

    def __call__(self, batch):
        # Output shapes:
        #   mel: [b, T, n_mels]; frame_pitch / frame_energy: [1, T]
        #   target_len: [1]; spk_id: [b, 1]; mask: [b, T, 1]
        collated = dict()

        for key in batch[0].keys():
            if key in ("target_len", "text_len"):
                # Lengths plus a padded 0/1 mask derived from them.
                collated[key] = torch.LongTensor([item[key] for item in batch])
                ones = [
                    torch.ones((item[key], 1), dtype=torch.long) for item in batch
                ]
                mask_name = "mask" if key == "target_len" else "text_mask"
                collated[mask_name] = pad_sequence(
                    ones, batch_first=True, padding_value=0
                )
            elif key == "spk_id":
                collated[key] = torch.LongTensor([item[key] for item in batch])
            elif key == "uid":
                collated[key] = [item["uid"] for item in batch]
            else:
                # Variable-length numpy features: convert and zero-pad.
                collated[key] = pad_sequence(
                    [torch.from_numpy(item[key]) for item in batch],
                    batch_first=True,
                    padding_value=0,
                )
        return collated
274
+
275
+
276
class FS2TestDataset(BaseTestDataset):
    """FastSpeech 2 test/inference dataset: phone-id sequences plus speaker ids."""

    def __init__(self, args, cfg, infer_type=None):
        datasets = cfg.dataset
        cfg = cfg.preprocess
        is_bigdata = False

        # Multiple datasets share a merged "bigdata" processed dir named by
        # the sorted, underscore-joined dataset list.
        assert len(datasets) >= 1
        if len(datasets) > 1:
            datasets.sort()
            bigdata_version = "_".join(datasets)
            processed_data_dir = os.path.join(cfg.processed_dir, bigdata_version)
            is_bigdata = True
        else:
            processed_data_dir = os.path.join(cfg.processed_dir, args.dataset)

        # Metadata comes either from an explicit test-list file or from the
        # named testing split of the source dataset.
        if args.test_list_file:
            self.metafile_path = args.test_list_file
            self.metadata = self.get_metadata()
        else:
            assert args.testing_set
            source_metafile_path = os.path.join(
                cfg.processed_dir,
                args.dataset,
                "{}.json".format(args.testing_set),
            )
            with open(source_metafile_path, "r") as f:
                self.metadata = json.load(f)

        self.cfg = cfg
        self.datasets = datasets
        self.data_root = processed_data_dir
        self.is_bigdata = is_bigdata
        self.source_dataset = args.dataset

        ######### Load source acoustic features #########
        if cfg.use_spkid:
            spk2id_path = os.path.join(self.data_root, cfg.spk2id)
            utt2sp_path = os.path.join(self.data_root, cfg.utt2spk)
            self.spk2id, self.utt2spk = get_spk_map(spk2id_path, utt2sp_path, datasets)

        # utt2lab: "<dataset>_<uid>" -> path of the phone-label .txt file
        self.utt2lab_path = {}
        for utt_info in self.metadata:
            dataset = utt_info["Dataset"]
            uid = utt_info["Uid"]
            utt = "{}_{}".format(dataset, uid)
            self.utt2lab_path[utt] = os.path.join(
                cfg.processed_dir,
                dataset,
                cfg.lab_dir,
                uid + ".txt",
            )

        self.speaker_map = {}
        spk2id_json = os.path.join(cfg.processed_dir, "spk2id.json")
        if os.path.exists(spk2id_json):
            # BUGFIX: the original called open(os.path.exists(path)), passing
            # a bool to open(), which raised whenever the file existed.
            with open(spk2id_json) as f:
                self.speaker_map = json.load(f)

    def __getitem__(self, index):
        """Return phone ids, speaker id and text length for one utterance."""
        single_feature = {}

        utt_info = self.metadata[index]
        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        # text: one line of phones per utterance
        with open(self.utt2lab_path[utt], "r") as f:
            phones = f.readlines()[0].strip()

        phones_ids = np.array(text_to_sequence(phones, self.cfg.text_cleaners))
        text_len = len(phones_ids)

        # speaker: fall back to id 0 when no speaker map was loaded
        if len(self.speaker_map) > 0:
            speaker_id = self.speaker_map[utt_info["Singer"]]
        else:
            speaker_id = 0

        single_feature.update(
            {
                "texts": phones_ids,
                "spk_id": speaker_id,
                "text_len": text_len,
            }
        )

        return single_feature

    def __len__(self):
        return len(self.metadata)

    def get_metadata(self):
        """Load the utterance metadata list from the test-list JSON file."""
        with open(self.metafile_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        return metadata
376
+
377
+
378
class FS2TestCollator(BaseTestCollator):
    """Zero-pads model inputs and targets based on number of frames per step."""

    def __init__(self, cfg):
        self.cfg = cfg

    def __call__(self, batch):
        # Output shapes:
        #   mel: [b, T, n_mels]; frame_pitch / frame_energy: [1, T]
        #   target_len: [1]; spk_id: [b, 1]; mask: [b, T, 1]
        collated = dict()

        for key in batch[0].keys():
            if key in ("target_len", "text_len"):
                # Lengths plus a padded 0/1 mask derived from them.
                collated[key] = torch.LongTensor([item[key] for item in batch])
                ones = [
                    torch.ones((item[key], 1), dtype=torch.long) for item in batch
                ]
                mask_name = "mask" if key == "target_len" else "text_mask"
                collated[mask_name] = pad_sequence(
                    ones, batch_first=True, padding_value=0
                )
            elif key == "spk_id":
                collated[key] = torch.LongTensor([item[key] for item in batch])
            else:
                # Variable-length numpy features: convert and zero-pad.
                collated[key] = pad_sequence(
                    [torch.from_numpy(item[key]) for item in batch],
                    batch_first=True,
                    padding_value=0,
                )

        return collated
Amphion/models/tts/naturalspeech2/diffusion.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import numpy as np
9
+ import torch.nn.functional as F
10
+ from models.tts.naturalspeech2.wavenet import WaveNet
11
+
12
+
13
class Diffusion(nn.Module):
    """Score-based diffusion decoder for NaturalSpeech 2.

    A WaveNet estimator predicts x0 directly; the forward/reverse processes
    follow a VP-style SDE with a linear beta schedule between ``beta_min``
    and ``beta_max`` and stationary variance ``sigma**2``.
    """

    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg

        # x0-estimator network; cfg.wavenet configures the WaveNet backbone.
        self.diff_estimator = WaveNet(cfg.wavenet)
        self.beta_min = cfg.beta_min
        self.beta_max = cfg.beta_max
        self.sigma = cfg.sigma
        self.noise_factor = cfg.noise_factor

    def forward(self, x, x_mask, cond, spk_query_emb, offset=1e-5):
        """
        x: (B, 128, T)
        x_mask: (B, T), mask is 0
        cond: (B, T, 512)
        spk_query_emb: (B, 32, 512)

        Samples one random diffusion time per batch element, diffuses x to
        xt, and returns the estimator's x0 prediction together with the
        implied noise prediction and the true noise (for training losses).
        """
        diffusion_step = torch.rand(
            x.shape[0], dtype=x.dtype, device=x.device, requires_grad=False
        )
        # Keep t strictly inside (0, 1) for numerical stability.
        diffusion_step = torch.clamp(diffusion_step, offset, 1.0 - offset)
        xt, z = self.forward_diffusion(x0=x, diffusion_step=diffusion_step)

        cum_beta = self.get_cum_beta(diffusion_step.unsqueeze(-1).unsqueeze(-1))
        x0_pred = self.diff_estimator(xt, x_mask, cond, diffusion_step, spk_query_emb)
        # Invert the forward-diffusion mean/variance to recover the noise
        # implied by the predicted x0.
        mean_pred = x0_pred * torch.exp(-0.5 * cum_beta / (self.sigma**2))
        variance = (self.sigma**2) * (1.0 - torch.exp(-cum_beta / (self.sigma**2)))
        noise_pred = (xt - mean_pred) / (torch.sqrt(variance) * self.noise_factor)
        noise = z
        diff_out = {"x0_pred": x0_pred, "noise_pred": noise_pred, "noise": noise}
        return diff_out

    @torch.no_grad()
    def get_cum_beta(self, time_step):
        # Integral of beta(s) ds over [0, t] for the linear schedule.
        return self.beta_min * time_step + 0.5 * (self.beta_max - self.beta_min) * (
            time_step**2
        )

    @torch.no_grad()
    def get_beta_t(self, time_step):
        # Instantaneous beta(t) of the linear schedule.
        return self.beta_min + (self.beta_max - self.beta_min) * time_step

    @torch.no_grad()
    def forward_diffusion(self, x0, diffusion_step):
        """
        x0: (B, 128, T)
        diffusion_step: (B,)

        Returns the diffused sample xt and the Gaussian noise z used.
        """
        time_step = diffusion_step.unsqueeze(-1).unsqueeze(-1)
        cum_beta = self.get_cum_beta(time_step)
        mean = x0 * torch.exp(-0.5 * cum_beta / (self.sigma**2))
        variance = (self.sigma**2) * (1 - torch.exp(-cum_beta / (self.sigma**2)))
        z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device, requires_grad=False)
        xt = mean + z * torch.sqrt(variance) * self.noise_factor
        return xt, z

    @torch.no_grad()
    def cal_dxt(self, xt, x_mask, cond, spk_query_emb, diffusion_step, h):
        """One reverse-ODE increment dxt for step size h at time diffusion_step."""
        time_step = diffusion_step.unsqueeze(-1).unsqueeze(-1)
        cum_beta = self.get_cum_beta(time_step=time_step)
        beta_t = self.get_beta_t(time_step=time_step)
        x0_pred = self.diff_estimator(xt, x_mask, cond, diffusion_step, spk_query_emb)
        mean_pred = x0_pred * torch.exp(-0.5 * cum_beta / (self.sigma**2))
        noise_pred = xt - mean_pred
        variance = (self.sigma**2) * (1.0 - torch.exp(-cum_beta / (self.sigma**2)))
        # Score (grad log p) estimate derived from the predicted noise;
        # 1e-8 guards against division by a vanishing variance.
        logp = -noise_pred / (variance + 1e-8)
        dxt = -0.5 * h * beta_t * (logp + xt / (self.sigma**2))
        return dxt

    @torch.no_grad()
    def reverse_diffusion(self, z, x_mask, cond, n_timesteps, spk_query_emb):
        """Integrate the reverse ODE from t=1 down to t=0 starting at z,
        with either a midpoint or an Euler solver (cfg.ode_solver)."""
        h = 1.0 / max(n_timesteps, 1)
        xt = z
        for i in range(n_timesteps):
            # Midpoint of the i-th sub-interval, walking backwards from t=1.
            t = (1.0 - (i + 0.5) * h) * torch.ones(
                z.shape[0], dtype=z.dtype, device=z.device
            )
            dxt = self.cal_dxt(xt, x_mask, cond, spk_query_emb, diffusion_step=t, h=h)
            xt_ = xt - dxt
            if self.cfg.ode_solver == "midpoint":
                # Re-evaluate the derivative at the midpoint state.
                x_mid = 0.5 * (xt_ + xt)
                dxt = self.cal_dxt(
                    x_mid, x_mask, cond, spk_query_emb, diffusion_step=t + 0.5 * h, h=h
                )
                xt = xt - dxt
            elif self.cfg.ode_solver == "euler":
                xt = xt_
        return xt

    @torch.no_grad()
    def reverse_diffusion_from_t(
        self, z, x_mask, cond, n_timesteps, spk_query_emb, t_start
    ):
        """Same as reverse_diffusion but starting from time t_start instead of 1."""
        h = t_start / max(n_timesteps, 1)
        xt = z
        for i in range(n_timesteps):
            t = (t_start - (i + 0.5) * h) * torch.ones(
                z.shape[0], dtype=z.dtype, device=z.device
            )
            dxt = self.cal_dxt(xt, x_mask, cond, spk_query_emb, diffusion_step=t, h=h)
            xt_ = xt - dxt
            if self.cfg.ode_solver == "midpoint":
                x_mid = 0.5 * (xt_ + xt)
                dxt = self.cal_dxt(
                    x_mid, x_mask, cond, spk_query_emb, diffusion_step=t + 0.5 * h, h=h
                )
                xt = xt - dxt
            elif self.cfg.ode_solver == "euler":
                xt = xt_
        return xt
Amphion/models/tts/vits/vits_inference.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import time
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import torch
11
+ import json
12
+ from models.tts.base.tts_inferece import TTSInference
13
+ from models.tts.vits.vits_dataset import VITSTestDataset, VITSTestCollator
14
+ from models.tts.vits.vits import SynthesizerTrn
15
+ from processors.phone_extractor import phoneExtractor
16
+ from text.text_token_collation import phoneIDCollation
17
+ from utils.data_utils import *
18
+
19
+
20
class VitsInference(TTSInference):
    """Inference pipeline for the VITS end-to-end TTS model."""

    def __init__(self, args=None, cfg=None):
        TTSInference.__init__(self, args, cfg)

    def _build_model(self):
        """Instantiate the VITS synthesizer from the experiment config."""
        net_g = SynthesizerTrn(
            self.cfg.model.text_token_num,
            self.cfg.preprocess.n_fft // 2 + 1,
            self.cfg.preprocess.segment_size // self.cfg.preprocess.hop_size,
            **self.cfg.model,
        )

        return net_g

    def _build_test_dataset(self):
        # Fix: the first parameter was misspelled "sefl".
        return VITSTestDataset, VITSTestCollator

    def build_save_dir(self, dataset, speaker):
        """Build (and create) the output directory for synthesized audio.

        Args:
            dataset: dataset name, or None to skip the dataset sub-folder.
            speaker: speaker id; -1 means single-speaker (no speaker sub-folder).
        """
        save_dir = os.path.join(
            self.args.output_dir,
            "tts_am_step-{}_{}".format(self.am_restore_step, self.args.mode),
        )
        if dataset is not None:
            save_dir = os.path.join(save_dir, "data_{}".format(dataset))
        if speaker != -1:
            save_dir = os.path.join(
                save_dir,
                "spk_{}".format(speaker),
            )
        os.makedirs(save_dir, exist_ok=True)
        print("Saving to ", save_dir)
        return save_dir

    def inference_for_batches(
        self, noise_scale=0.667, noise_scale_w=0.8, length_scale=1
    ):
        """Synthesize waveforms for every batch in the test dataloader.

        Returns:
            list of 1-D float CPU tensors, one trimmed waveform per utterance.
        """
        ###### Construct test_batch ######
        n_batch = len(self.test_dataloader)
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        print(
            "Model eval time: {}, batch_size = {}, n_batch = {}".format(
                now, self.test_batch_size, n_batch
            )
        )
        self.model.eval()

        ###### Inference for each batch ######
        pred_res = []
        with torch.no_grad():
            for i, batch_data in enumerate(
                self.test_dataloader if n_batch == 1 else tqdm(self.test_dataloader)
            ):
                # Only pass a speaker id when multi-speaker training was enabled.
                spk_id = None
                if (
                    self.cfg.preprocess.use_spkid
                    and self.cfg.train.multi_speaker_training
                ):
                    spk_id = batch_data["spk_id"]

                outputs = self.model.infer(
                    batch_data["phone_seq"],
                    batch_data["phone_len"],
                    spk_id,
                    noise_scale=noise_scale,
                    noise_scale_w=noise_scale_w,
                    length_scale=length_scale,
                )

                audios = outputs["y_hat"]
                masks = outputs["mask"]

                for idx in range(audios.size(0)):
                    audio = audios[idx, 0, :].data.cpu().float()
                    mask = masks[idx, :, :]
                    # Valid sample count = (#unmasked frames) * hop size.
                    audio_length = (
                        mask.sum([0, 1]).long() * self.cfg.preprocess.hop_size
                    )
                    audio_length = audio_length.cpu().numpy()
                    audio = audio[:audio_length]
                    pred_res.append(audio)

        return pred_res

    def inference_for_single_utterance(
        self, noise_scale=0.667, noise_scale_w=0.8, length_scale=1
    ):
        """Synthesize one utterance from ``self.args.text``.

        Returns:
            1-D numpy float array with the synthesized waveform.
        """
        text = self.args.text

        # get phone symbol file
        phone_symbol_file = None
        if self.cfg.preprocess.phone_extractor != "lexicon":
            phone_symbol_file = os.path.join(
                self.exp_dir, self.cfg.preprocess.symbols_dict
            )
            assert os.path.exists(phone_symbol_file)
        # convert text to phone sequence
        phone_extractor = phoneExtractor(self.cfg)
        phone_seq = phone_extractor.extract_phone(text)  # phone_seq: list
        # convert phone sequence to phone id sequence
        phon_id_collator = phoneIDCollation(
            self.cfg, symbols_dict_file=phone_symbol_file
        )
        phone_id_seq = phon_id_collator.get_phone_id_sequence(self.cfg, phone_seq)

        # Interleave blank tokens (id 0) between phones, as done in training.
        if self.cfg.preprocess.add_blank:
            phone_id_seq = intersperse(phone_id_seq, 0)

        # convert phone sequence to phone id sequence
        phone_id_seq = np.array(phone_id_seq)
        phone_id_seq = torch.from_numpy(phone_id_seq)

        # get speaker id if multi-speaker training and use speaker id
        speaker_id = None
        if self.cfg.preprocess.use_spkid and self.cfg.train.multi_speaker_training:
            spk2id_file = os.path.join(self.exp_dir, self.cfg.preprocess.spk2id)
            with open(spk2id_file, "r") as f:
                spk2id = json.load(f)
            speaker_name = self.args.speaker_name
            assert (
                speaker_name in spk2id
            ), f"Speaker {speaker_name} not found in the spk2id keys. \
                Please make sure you've specified the correct speaker name in infer_speaker_name."
            speaker_id = spk2id[speaker_name]
            speaker_id = torch.from_numpy(
                np.array([speaker_id], dtype=np.int32)
            ).unsqueeze(0)

        with torch.no_grad():
            x_tst = phone_id_seq.to(self.device).unsqueeze(0)
            x_tst_lengths = torch.LongTensor([phone_id_seq.size(0)]).to(self.device)
            if speaker_id is not None:
                speaker_id = speaker_id.to(self.device)
            outputs = self.model.infer(
                x_tst,
                x_tst_lengths,
                sid=speaker_id,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=length_scale,
            )

        audio = outputs["y_hat"][0, 0].data.cpu().float().numpy()

        return audio
Amphion/models/vocoders/flow/flow_vocoder_trainer.py ADDED
File without changes
Amphion/models/vocoders/gan/gan_vocoder_dataset.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import random
8
+
9
+ import numpy as np
10
+
11
+ from torch.nn import functional as F
12
+
13
+ from torch.nn.utils.rnn import pad_sequence
14
+ from utils.data_utils import *
15
+ from models.vocoders.vocoder_dataset import VocoderDataset
16
+
17
+
18
class GANVocoderDataset(VocoderDataset):
    def __init__(self, cfg, dataset, is_valid=False):
        """
        Args:
            cfg: config
            dataset: dataset name
            is_valid: whether to use train or valid dataset
        """
        super().__init__(cfg, dataset, is_valid)

        # Cache one random utterance's features for periodic evaluation.
        eval_index = random.randint(0, len(self.metadata) - 1)
        eval_utt_info = self.metadata[eval_index]
        eval_utt = "{}_{}".format(eval_utt_info["Dataset"], eval_utt_info["Uid"])
        self.eval_audio = np.load(self.utt2audio_path[eval_utt])
        if cfg.preprocess.use_mel:
            self.eval_mel = np.load(self.utt2mel_path[eval_utt])
        if cfg.preprocess.use_frame_pitch:
            self.eval_pitch = np.load(self.utt2frame_pitch_path[eval_utt])

    def __getitem__(self, index):
        """Return one sample with every feature cropped or zero-padded to a
        fixed window of ``cut_mel_frame`` frames (audio to the matching number
        of samples). The random crop offset is shared across features via
        the "start"/"end" entries so they stay aligned.
        """
        utt_info = self.metadata[index]

        dataset = utt_info["Dataset"]
        uid = utt_info["Uid"]
        utt = "{}_{}".format(dataset, uid)

        single_feature = dict()

        if self.cfg.preprocess.use_mel:
            mel = np.load(self.utt2mel_path[utt])
            assert mel.shape[0] == self.cfg.preprocess.n_mel

            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = mel.shape[1]

            if single_feature["target_len"] <= self.cfg.preprocess.cut_mel_frame:
                mel = np.pad(
                    mel,
                    ((0, 0), (0, self.cfg.preprocess.cut_mel_frame - mel.shape[-1])),
                    mode="constant",
                )
            else:
                if "start" not in single_feature.keys():
                    start = random.randint(
                        0, mel.shape[-1] - self.cfg.preprocess.cut_mel_frame
                    )
                    end = start + self.cfg.preprocess.cut_mel_frame
                    single_feature["start"] = start
                    single_feature["end"] = end
                mel = mel[:, single_feature["start"] : single_feature["end"]]
            single_feature["mel"] = mel

        if self.cfg.preprocess.use_frame_pitch:
            frame_pitch = np.load(self.utt2frame_pitch_path[utt])
            if "target_len" not in single_feature.keys():
                single_feature["target_len"] = len(frame_pitch)
            aligned_frame_pitch = align_length(
                frame_pitch, single_feature["target_len"]
            )

            if single_feature["target_len"] <= self.cfg.preprocess.cut_mel_frame:
                # Fix: the original computed the pad width from the (not yet
                # loaded) `audio` array in sample units, raising a NameError;
                # pitch is a per-frame feature, so pad it to cut_mel_frame.
                aligned_frame_pitch = np.pad(
                    aligned_frame_pitch,
                    (
                        0,
                        self.cfg.preprocess.cut_mel_frame
                        - aligned_frame_pitch.shape[-1],
                    ),
                    mode="constant",
                )
            else:
                if "start" not in single_feature.keys():
                    start = random.randint(
                        0,
                        aligned_frame_pitch.shape[-1]
                        - self.cfg.preprocess.cut_mel_frame,
                    )
                    end = start + self.cfg.preprocess.cut_mel_frame
                    single_feature["start"] = start
                    single_feature["end"] = end
                aligned_frame_pitch = aligned_frame_pitch[
                    single_feature["start"] : single_feature["end"]
                ]
            single_feature["frame_pitch"] = aligned_frame_pitch

        if self.cfg.preprocess.use_audio:
            audio = np.load(self.utt2audio_path[utt])

            assert "target_len" in single_feature.keys()

            if (
                audio.shape[-1]
                <= self.cfg.preprocess.cut_mel_frame * self.cfg.preprocess.hop_size
            ):
                audio = np.pad(
                    audio,
                    (
                        (
                            0,
                            self.cfg.preprocess.cut_mel_frame
                            * self.cfg.preprocess.hop_size
                            - audio.shape[-1],
                        )
                    ),
                    mode="constant",
                )
            else:
                if "start" not in single_feature.keys():
                    audio = audio[
                        0 : self.cfg.preprocess.cut_mel_frame
                        * self.cfg.preprocess.hop_size
                    ]
                else:
                    # Crop audio samples to match the frame crop chosen above.
                    audio = audio[
                        single_feature["start"]
                        * self.cfg.preprocess.hop_size : single_feature["end"]
                        * self.cfg.preprocess.hop_size,
                    ]
            single_feature["audio"] = audio

        if self.cfg.preprocess.use_amplitude_phase:
            logamp = np.load(self.utt2logamp_path[utt])
            pha = np.load(self.utt2pha_path[utt])
            rea = np.load(self.utt2rea_path[utt])
            imag = np.load(self.utt2imag_path[utt])

            assert "target_len" in single_feature.keys()

            if single_feature["target_len"] <= self.cfg.preprocess.cut_mel_frame:
                # Fix: pad widths previously used mel.shape[-1], which is already
                # padded to cut_mel_frame (width 0) and undefined when use_mel is
                # False; pad each feature based on its own length instead.
                def _pad_frames(feat):
                    # Zero-pad a (bins, frames) feature along the frame axis.
                    return np.pad(
                        feat,
                        (
                            (0, 0),
                            (0, self.cfg.preprocess.cut_mel_frame - feat.shape[-1]),
                        ),
                        mode="constant",
                    )

                logamp = _pad_frames(logamp)
                pha = _pad_frames(pha)
                rea = _pad_frames(rea)
                imag = _pad_frames(imag)
            else:
                logamp = logamp[:, single_feature["start"] : single_feature["end"]]
                pha = pha[:, single_feature["start"] : single_feature["end"]]
                rea = rea[:, single_feature["start"] : single_feature["end"]]
                imag = imag[:, single_feature["start"] : single_feature["end"]]
            single_feature["logamp"] = logamp
            single_feature["pha"] = pha
            single_feature["rea"] = rea
            single_feature["imag"] = imag

        return single_feature
181
+
182
+
183
class GANVocoderCollator(object):
    """Collate function for GAN vocoder batches.

    Converts each per-utterance numpy feature to a tensor and zero-pads it
    along the first (time/frame) dimension. Bookkeeping entries
    ("target_len", "start", "end") are dropped from the packed batch.
    """

    META_KEYS = ("target_len", "start", "end")

    def __init__(self, cfg):
        self.cfg = cfg

    def __call__(self, batch):
        # Expected shapes after packing:
        #   mel: [b, n_mels, frame]; frame_pitch: [b, frame];
        #   audio: [b, frame * hop_size]
        packed = dict()
        for key in batch[0].keys():
            if key in self.META_KEYS:
                continue
            tensors = [torch.from_numpy(sample[key]) for sample in batch]
            packed[key] = pad_sequence(tensors, batch_first=True, padding_value=0)
        return packed
Amphion/modules/anti_aliasing/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .act import *
7
+ from .filter import *
8
+ from .resample import *
Amphion/modules/encoder/condition_encoder.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from torchaudio.models import Conformer
10
+ from models.svc.transformer.transformer import PositionalEncoding
11
+
12
+ from utils.f0 import f0_to_coarse
13
+
14
+
15
class ContentEncoder(nn.Module):
    """Project frame-level content features to the shared encoder dimension.

    Optionally refines the features with a positional encoding plus a
    Conformer stack before the final linear projection.
    """

    def __init__(self, cfg, input_dim, output_dim):
        super().__init__()
        self.cfg = cfg

        assert input_dim != 0
        self.nn = nn.Linear(input_dim, output_dim)

        # Optional Conformer refinement, controlled by a config flag.
        use_conformer = (
            "use_conformer_for_content_features" in cfg
            and cfg.use_conformer_for_content_features
        )
        if use_conformer:
            self.pos_encoder = PositionalEncoding(input_dim)
            self.conformer = Conformer(
                input_dim=input_dim,
                num_heads=2,
                ffn_dim=256,
                num_layers=6,
                depthwise_conv_kernel_size=3,
            )
        else:
            self.conformer = None

    def forward(self, x, length=None):
        """x: (N, seq_len, input_dim) -> (N, seq_len, output_dim)."""
        hidden = x
        if self.conformer:
            hidden = self.pos_encoder(hidden)
            hidden, _ = self.conformer(hidden, length)
        return self.nn(hidden)
45
+
46
+
47
class MelodyEncoder(nn.Module):
    """Encode frame-level F0 (optionally with a voiced/unvoiced flag).

    With quantization (``n_bins_melody > 0``) the F0 contour is bucketed and
    embedded; without it the raw contour is passed through unchanged as a
    single-channel feature.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.input_dim = self.cfg.input_melody_dim
        self.output_dim = self.cfg.output_melody_dim
        self.n_bins = self.cfg.n_bins_melody

        if self.input_dim == 0:
            return
        if self.n_bins == 0:
            # Continuous F0 path: a plain linear projection, no quantization.
            self.nn = nn.Linear(self.input_dim, self.output_dim)
        else:
            self.f0_min = cfg.f0_min
            self.f0_max = cfg.f0_max
            self.nn = nn.Embedding(
                num_embeddings=self.n_bins,
                embedding_dim=self.output_dim,
                padding_idx=None,
            )
            self.uv_embedding = nn.Embedding(2, self.output_dim)

    def forward(self, x, uv=None, length=None):
        """x: (B, frame_len) F0 values; uv: (B, frame_len) int voiced flags."""
        if self.n_bins == 0:
            # NOTE(review): in this branch the linear layer is never applied;
            # the raw contour is returned as (B, frame_len, 1) — confirm intended.
            out = x.unsqueeze(-1)
        else:
            out = self.nn(f0_to_coarse(x, self.n_bins, self.f0_min, self.f0_max))

        if self.cfg.use_uv:
            out = out + self.uv_embedding(uv)
        return out
83
+
84
+
85
class LoudnessEncoder(nn.Module):
    """Encode frame-level loudness, either continuously or via log-spaced bins."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.input_dim = self.cfg.input_loudness_dim
        self.output_dim = self.cfg.output_loudness_dim
        self.n_bins = self.cfg.n_bins_loudness

        if self.input_dim == 0:
            return
        if self.n_bins == 0:
            # Continuous loudness: plain linear projection.
            self.nn = nn.Linear(self.input_dim, self.output_dim)
        else:
            # TODO: set empirically now
            self.loudness_min = 1e-30
            self.loudness_max = 1.5
            # Log-spaced bin edges; stored as a non-trainable parameter so they
            # move with the module across devices and land in the state dict.
            self.energy_bins = nn.Parameter(
                torch.exp(
                    torch.linspace(
                        np.log(self.loudness_min),
                        np.log(self.loudness_max),
                        self.n_bins - 1,
                    )
                ),
                requires_grad=False,
            )
            self.nn = nn.Embedding(
                num_embeddings=self.n_bins,
                embedding_dim=self.output_dim,
                padding_idx=None,
            )

    def forward(self, x):
        """x: (N, frame_len) -> (N, frame_len, output_dim)."""
        if self.n_bins == 0:
            feats = x.unsqueeze(-1)
        else:
            feats = torch.bucketize(x, self.energy_bins)
        return self.nn(feats)
126
+
127
+
128
class SingerEncoder(nn.Module):
    """Look up a learned embedding for a singer/speaker id."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        self.input_dim = 1
        self.output_dim = self.cfg.output_singer_dim

        # One embedding row per singer in the table.
        self.nn = nn.Embedding(
            num_embeddings=cfg.singer_table_size,
            embedding_dim=self.output_dim,
            padding_idx=None,
        )

    def forward(self, x):
        """x: (N, 1) integer ids -> (N, 1, output_dim) embeddings."""
        return self.nn(x)
145
+
146
+
147
class ConditionEncoder(nn.Module):
    """Fuse content, prosody, and speaker features into one conditioning tensor.

    Sub-encoders are created only for the feature types enabled in ``cfg``;
    their outputs are merged by concatenation or summation (``cfg.merge_mode``).
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.merge_mode = cfg.merge_mode

        ### Semantic Features ###
        if cfg.use_whisper:
            self.whisper_encoder = ContentEncoder(
                self.cfg, self.cfg.whisper_dim, self.cfg.content_encoder_dim
            )
        if cfg.use_contentvec:
            self.contentvec_encoder = ContentEncoder(
                self.cfg, self.cfg.contentvec_dim, self.cfg.content_encoder_dim
            )
        if cfg.use_mert:
            self.mert_encoder = ContentEncoder(
                self.cfg, self.cfg.mert_dim, self.cfg.content_encoder_dim
            )
        if cfg.use_wenet:
            self.wenet_encoder = ContentEncoder(
                self.cfg, self.cfg.wenet_dim, self.cfg.content_encoder_dim
            )

        ### Prosody Features ###
        if cfg.use_f0:
            self.melody_encoder = MelodyEncoder(self.cfg)
        if cfg.use_energy:
            self.loudness_encoder = LoudnessEncoder(self.cfg)

        ### Speaker Features ###
        if cfg.use_spkid:
            self.singer_encoder = SingerEncoder(self.cfg)

    def forward(self, x):
        """Encode a batch dict of features into a single conditioning tensor.

        Args:
            x: dict holding the per-feature batch tensors ("frame_pitch",
               "whisper_feat", "spk_id", ...) for the enabled feature types.

        Returns:
            (N, seq_len, D) tensor; D is the sum of sub-encoder dims under
            "concat" merging, or the shared dim under "add" merging.
        """
        outputs = []

        if self.cfg.use_f0:
            if self.cfg.use_uv:
                pitch_enc_out = self.melody_encoder(
                    x["frame_pitch"], uv=x["frame_uv"], length=x["target_len"]
                )
            else:
                pitch_enc_out = self.melody_encoder(
                    x["frame_pitch"], uv=None, length=x["target_len"]
                )
            outputs.append(pitch_enc_out)

        if self.cfg.use_energy:
            loudness_enc_out = self.loudness_encoder(x["frame_energy"])
            outputs.append(loudness_enc_out)

        if self.cfg.use_whisper:
            # whisper_feat: [b, T, 1024]
            whiser_enc_out = self.whisper_encoder(
                x["whisper_feat"], length=x["target_len"]
            )
            outputs.append(whiser_enc_out)
            # seq_len is remembered so the speaker embedding can be broadcast.
            seq_len = whiser_enc_out.shape[1]

        if self.cfg.use_contentvec:
            contentvec_enc_out = self.contentvec_encoder(
                x["contentvec_feat"], length=x["target_len"]
            )
            outputs.append(contentvec_enc_out)
            seq_len = contentvec_enc_out.shape[1]

        if self.cfg.use_mert:
            mert_enc_out = self.mert_encoder(x["mert_feat"], length=x["target_len"])
            outputs.append(mert_enc_out)
            seq_len = mert_enc_out.shape[1]

        if self.cfg.use_wenet:
            wenet_enc_out = self.wenet_encoder(x["wenet_feat"], length=x["target_len"])
            outputs.append(wenet_enc_out)
            seq_len = wenet_enc_out.shape[1]

        if self.cfg.use_spkid:
            speaker_enc_out = self.singer_encoder(x["spk_id"])  # [b, 1, 384]
            # At least one content feature must be present: it defines seq_len.
            assert (
                "whisper_feat" in x.keys()
                or "contentvec_feat" in x.keys()
                or "mert_feat" in x.keys()
                or "wenet_feat" in x.keys()
            )
            # Broadcast the single speaker vector across all frames.
            singer_info = speaker_enc_out.expand(-1, seq_len, -1)
            outputs.append(singer_info)

        encoder_output = None
        if self.merge_mode == "concat":
            encoder_output = torch.cat(outputs, dim=-1)
        if self.merge_mode == "add":
            # (#modules, N, seq_len, output_dim)
            outputs = torch.cat([out[None, :, :, :] for out in outputs], dim=0)
            # (N, seq_len, output_dim)
            encoder_output = torch.sum(outputs, dim=0)

        return encoder_output
Amphion/modules/general/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .input_strategies import PromptedFeatures, PromptedPrecomputedFeatures
2
+ from .scaling import BalancedDoubleSwish
3
+ from .utils import Transpose
Amphion/modules/monotonic_align/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This code from https://github.com/jaywalnut310/vits/
2
+
3
+ import numpy as np
4
+ import torch
5
+ from .monotonic_align.core import maximum_path_c
6
+
7
+
8
def maximum_path(neg_cent, mask):
    """Cython-optimized monotonic alignment search.

    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    Returns the hard alignment path on neg_cent's original device/dtype.
    """
    device, dtype = neg_cent.device, neg_cent.dtype
    scores = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(scores.shape, dtype=np.int32)

    # Per-sample valid lengths along each axis, read off the first row/column.
    t_t_lens = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_lens = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
    maximum_path_c(path, scores, t_t_lens, t_s_lens)  # fills `path` in place
    return torch.from_numpy(path).to(device=device, dtype=dtype)
Amphion/modules/neural_source_filter/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .sine_excitation import *
Amphion/modules/transformer/Layers.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn import functional as F
9
+ from .SubLayers import MultiHeadAttention, PositionwiseFeedForward
10
+
11
+
12
class FFTBlock(torch.nn.Module):
    """Feed-forward Transformer block: self-attention + convolutional FFN.

    Padded positions are zeroed after each sub-layer using ``mask``.
    """

    def __init__(self, d_model, n_head, d_k, d_v, d_inner, kernel_size, dropout=0.1):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, kernel_size, dropout=dropout
        )

    def forward(self, enc_input, mask=None, slf_attn_mask=None):
        # NOTE(review): despite the default, a non-None `mask` is required
        # (masked_fill below dereferences it) — confirm callers always pass one.
        out, attn_weights = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask
        )
        out = out.masked_fill(mask.unsqueeze(-1), 0)

        out = self.pos_ffn(out)
        out = out.masked_fill(mask.unsqueeze(-1), 0)

        return out, attn_weights
+
33
+
34
+ class ConvNorm(torch.nn.Module):
35
+ def __init__(
36
+ self,
37
+ in_channels,
38
+ out_channels,
39
+ kernel_size=1,
40
+ stride=1,
41
+ padding=None,
42
+ dilation=1,
43
+ bias=True,
44
+ w_init_gain="linear",
45
+ ):
46
+ super(ConvNorm, self).__init__()
47
+
48
+ if padding is None:
49
+ assert kernel_size % 2 == 1
50
+ padding = int(dilation * (kernel_size - 1) / 2)
51
+
52
+ self.conv = torch.nn.Conv1d(
53
+ in_channels,
54
+ out_channels,
55
+ kernel_size=kernel_size,
56
+ stride=stride,
57
+ padding=padding,
58
+ dilation=dilation,
59
+ bias=bias,
60
+ )
61
+
62
+ def forward(self, signal):
63
+ conv_signal = self.conv(signal)
64
+
65
+ return conv_signal
66
+
67
+
68
+ class PostNet(nn.Module):
69
+ """
70
+ PostNet: Five 1-d convolution with 512 channels and kernel size 5
71
+ """
72
+
73
+ def __init__(
74
+ self,
75
+ n_mel_channels=80,
76
+ postnet_embedding_dim=512,
77
+ postnet_kernel_size=5,
78
+ postnet_n_convolutions=5,
79
+ ):
80
+ super(PostNet, self).__init__()
81
+ self.convolutions = nn.ModuleList()
82
+
83
+ self.convolutions.append(
84
+ nn.Sequential(
85
+ ConvNorm(
86
+ n_mel_channels,
87
+ postnet_embedding_dim,
88
+ kernel_size=postnet_kernel_size,
89
+ stride=1,
90
+ padding=int((postnet_kernel_size - 1) / 2),
91
+ dilation=1,
92
+ w_init_gain="tanh",
93
+ ),
94
+ nn.BatchNorm1d(postnet_embedding_dim),
95
+ )
96
+ )
97
+
98
+ for i in range(1, postnet_n_convolutions - 1):
99
+ self.convolutions.append(
100
+ nn.Sequential(
101
+ ConvNorm(
102
+ postnet_embedding_dim,
103
+ postnet_embedding_dim,
104
+ kernel_size=postnet_kernel_size,
105
+ stride=1,
106
+ padding=int((postnet_kernel_size - 1) / 2),
107
+ dilation=1,
108
+ w_init_gain="tanh",
109
+ ),
110
+ nn.BatchNorm1d(postnet_embedding_dim),
111
+ )
112
+ )
113
+
114
+ self.convolutions.append(
115
+ nn.Sequential(
116
+ ConvNorm(
117
+ postnet_embedding_dim,
118
+ n_mel_channels,
119
+ kernel_size=postnet_kernel_size,
120
+ stride=1,
121
+ padding=int((postnet_kernel_size - 1) / 2),
122
+ dilation=1,
123
+ w_init_gain="linear",
124
+ ),
125
+ nn.BatchNorm1d(n_mel_channels),
126
+ )
127
+ )
128
+
129
+ def forward(self, x):
130
+ x = x.contiguous().transpose(1, 2)
131
+
132
+ for i in range(len(self.convolutions) - 1):
133
+ x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
134
+ x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
135
+
136
+ x = x.contiguous().transpose(1, 2)
137
+ return x
Amphion/modules/wenet_extractor/cif/predictor.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+ from typing import Optional
24
+
25
+ import torch
26
+ from torch import nn
27
+ from modules.wenet_extractor.utils.mask import make_pad_mask
28
+
29
+
30
class Predictor(nn.Module):
    """CIF (Continuous Integrate-and-Fire) predictor.

    Derives a frame-level firing weight ("alpha") from each encoder frame and
    integrates the hidden states into token-level acoustic embeddings via the
    module-level ``cif`` function (defined elsewhere in this file).
    """

    def __init__(
        self,
        idim,
        l_order,
        r_order,
        threshold=1.0,
        dropout=0.1,
        smooth_factor=1.0,
        noise_threshold=0,
        tail_threshold=0.45,
    ):
        super().__init__()

        # Depthwise conv over an (l_order + r_order + 1)-frame context window;
        # the constant padding keeps the output length equal to the input.
        self.pad = nn.ConstantPad1d((l_order, r_order), 0.0)
        self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1, groups=idim)
        self.cif_output = nn.Linear(idim, 1)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.threshold = threshold  # firing threshold used by cif()
        self.smooth_factor = smooth_factor  # scale applied to raw sigmoid weights
        self.noise_threshold = noise_threshold  # floor subtracted before ReLU
        self.tail_threshold = tail_threshold  # extra weight appended at the tail

    def forward(
        self,
        hidden,
        target_label: Optional[torch.Tensor] = None,
        mask: torch.Tensor = torch.tensor(0),
        ignore_id: int = -1,
        mask_chunk_predictor: Optional[torch.Tensor] = None,
        target_label_length: Optional[torch.Tensor] = None,
    ):
        """Predict firing weights and integrate hidden states into tokens.

        Returns:
            (acoustic_embeds, token_num, alphas, cif_peak)
        """
        h = hidden
        # (B, T, D) -> (B, D, T) for the depthwise conv; residual add after.
        context = h.transpose(1, 2)
        queries = self.pad(context)
        memory = self.cif_conv1d(queries)
        output = memory + context
        output = self.dropout(output)
        output = output.transpose(1, 2)
        output = torch.relu(output)
        output = self.cif_output(output)
        # Raw weights in (0, 1); rescale and clip small "noise" weights to 0.
        alphas = torch.sigmoid(output)
        alphas = torch.nn.functional.relu(
            alphas * self.smooth_factor - self.noise_threshold
        )
        if mask is not None:
            mask = mask.transpose(-1, -2).float()
            alphas = alphas * mask
        if mask_chunk_predictor is not None:
            alphas = alphas * mask_chunk_predictor
        alphas = alphas.squeeze(-1)
        mask = mask.squeeze(-1)
        # Target token count, if available (training with labels).
        if target_label_length is not None:
            target_length = target_label_length
        elif target_label is not None:
            target_length = (target_label != ignore_id).float().sum(-1)
        else:
            target_length = None
        token_num = alphas.sum(-1)
        if target_length is not None:
            # Rescale alphas so their sum matches the target token count exactly.
            alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
        elif self.tail_threshold > 0.0:
            # Inference: append tail weight so the last partial token fires.
            hidden, alphas, token_num = self.tail_process_fn(
                hidden, alphas, token_num, mask=mask
            )

        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)

        if target_length is None and self.tail_threshold > 0.0:
            # Trim embeddings to the longest integrated token count in the batch.
            token_num_int = torch.max(token_num).type(torch.int32).item()
            acoustic_embeds = acoustic_embeds[:, :token_num_int, :]

        return acoustic_embeds, token_num, alphas, cif_peak

    def tail_process_fn(
        self,
        hidden,
        alphas,
        token_num: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
    ):
        """Append a tail firing weight (and a zero hidden frame) per sequence.

        Ensures leftover accumulated weight at the end of a sequence still
        triggers a final token. Returns the extended hidden/alphas and the
        floored token count.
        """
        b, t, d = hidden.size()
        tail_threshold = self.tail_threshold
        if mask is not None:
            # mask_2 - mask_1 is 1 exactly at the first padded position of each
            # sequence, placing the tail weight right after the valid frames.
            zeros_t = torch.zeros((b, 1), dtype=torch.float32, device=alphas.device)
            ones_t = torch.ones_like(zeros_t)
            mask_1 = torch.cat([mask, zeros_t], dim=1)
            mask_2 = torch.cat([ones_t, mask], dim=1)
            mask = mask_2 - mask_1
            tail_threshold = mask * tail_threshold
            alphas = torch.cat([alphas, zeros_t], dim=1)
            alphas = torch.add(alphas, tail_threshold)
        else:
            # No mask: simply append the tail weight at the end.
            tail_threshold_tensor = torch.tensor(
                [tail_threshold], dtype=alphas.dtype
            ).to(alphas.device)
            tail_threshold_tensor = torch.reshape(tail_threshold_tensor, (1, 1))
            alphas = torch.cat([alphas, tail_threshold_tensor], dim=1)
        zeros = torch.zeros((b, 1, d), dtype=hidden.dtype).to(hidden.device)
        hidden = torch.cat([hidden, zeros], dim=1)
        token_num = alphas.sum(dim=-1)
        token_num_floor = torch.floor(token_num)

        return hidden, alphas, token_num_floor

    def gen_frame_alignments(
        self, alphas: torch.Tensor = None, encoder_sequence_length: torch.Tensor = None
    ):
        """Derive hard frame-to-token alignments from the firing weights.

        Returns:
            (alignments, alignment_lengths), both detached.
        """
        batch_size, maximum_length = alphas.size()
        int_type = torch.int32

        # Round during training, floor during inference (matches forward()).
        is_training = self.training
        if is_training:
            token_num = torch.round(torch.sum(alphas, dim=1)).type(int_type)
        else:
            token_num = torch.floor(torch.sum(alphas, dim=1)).type(int_type)

        max_token_num = torch.max(token_num).item()

        # Cumulative integrated weight per frame, replicated per token slot.
        alphas_cumsum = torch.cumsum(alphas, dim=1)
        alphas_cumsum = torch.floor(alphas_cumsum).type(int_type)
        alphas_cumsum = alphas_cumsum[:, None, :].repeat(1, max_token_num, 1)

        # index[b, k, :] == k+1: the token ordinal each row compares against.
        index = torch.ones([batch_size, max_token_num], dtype=int_type)
        index = torch.cumsum(index, dim=1)
        index = index[:, :, None].repeat(1, 1, maximum_length).to(alphas_cumsum.device)

        # Frames where the cumulative count has not yet reached token k.
        index_div = torch.floor(torch.true_divide(alphas_cumsum, index)).type(int_type)
        index_div_bool_zeros = index_div.eq(0)
        index_div_bool_zeros_count = torch.sum(index_div_bool_zeros, dim=-1) + 1
        index_div_bool_zeros_count = torch.clamp(
            index_div_bool_zeros_count, 0, encoder_sequence_length.max()
        )
        # Zero out slots beyond each sequence's actual token count.
        token_num_mask = (~make_pad_mask(token_num, max_len=max_token_num)).to(
            token_num.device
        )
        index_div_bool_zeros_count *= token_num_mask

        # Convert per-token boundary frame indices to a one-hot-per-frame map,
        # then sum over tokens to get each frame's token assignment.
        index_div_bool_zeros_count_tile = index_div_bool_zeros_count[:, :, None].repeat(
            1, 1, maximum_length
        )
        ones = torch.ones_like(index_div_bool_zeros_count_tile)
        zeros = torch.zeros_like(index_div_bool_zeros_count_tile)
        ones = torch.cumsum(ones, dim=2)
        cond = index_div_bool_zeros_count_tile == ones
        index_div_bool_zeros_count_tile = torch.where(cond, zeros, ones)

        index_div_bool_zeros_count_tile_bool = index_div_bool_zeros_count_tile.type(
            torch.bool
        )
        index_div_bool_zeros_count_tile = 1 - index_div_bool_zeros_count_tile_bool.type(
            int_type
        )
        index_div_bool_zeros_count_tile_out = torch.sum(
            index_div_bool_zeros_count_tile, dim=1
        )
        index_div_bool_zeros_count_tile_out = index_div_bool_zeros_count_tile_out.type(
            int_type
        )
        # Zero out frames beyond each sequence's encoder length.
        predictor_mask = (
            (
                ~make_pad_mask(
                    encoder_sequence_length, max_len=encoder_sequence_length.max()
                )
            )
            .type(int_type)
            .to(encoder_sequence_length.device)
        )
        index_div_bool_zeros_count_tile_out = (
            index_div_bool_zeros_count_tile_out * predictor_mask
        )

        predictor_alignments = index_div_bool_zeros_count_tile_out
        predictor_alignments_length = predictor_alignments.sum(-1).type(
            encoder_sequence_length.dtype
        )
        return predictor_alignments.detach(), predictor_alignments_length.detach()
207
+
208
+
209
class MAELoss(nn.Module):
    """L1 (mean absolute error) loss between true and predicted token counts.

    The summed L1 distance is divided either by the batch size (default) or,
    when ``normalize_length`` is set, by the total number of tokens.
    """

    def __init__(self, normalize_length=False):
        super().__init__()
        # When True, normalize by total token count instead of batch size.
        self.normalize_length = normalize_length
        self.criterion = torch.nn.L1Loss(reduction="sum")

    def forward(self, token_length, pre_token_length):
        """Return the normalized L1 loss between the two length tensors."""
        summed = self.criterion(token_length, pre_token_length)
        if self.normalize_length:
            denominator = token_length.sum().type(torch.float32)
        else:
            denominator = token_length.size(0)
        return summed / denominator
222
+
223
+
224
def cif(hidden: torch.Tensor, alphas: torch.Tensor, threshold: float):
    """Continuous Integrate-and-Fire: aggregate frames into token embeddings.

    Accumulates per-frame weights ``alphas`` over time; whenever the integral
    crosses ``threshold``, a token embedding is "fired" as the weighted sum of
    the frames since the previous firing, and the leftover weight starts the
    next token.

    Args:
        hidden: encoder outputs, shape (batch, T, hidden_size).
        alphas: per-frame firing weights, shape (batch, T).
        threshold: firing threshold (typically 1.0).

    Returns:
        Tuple of:
        - fired token embeddings, shape (batch, max_label_len, hidden_size),
          zero-padded per utterance to the largest rounded token count;
        - fires, shape (batch, T): the running integral at every frame.
    """
    batch_size, len_time, hidden_size = hidden.size()

    # loop vars
    integrate = torch.zeros([batch_size], device=hidden.device)
    frame = torch.zeros([batch_size, hidden_size], device=hidden.device)
    # intermediate vars along time
    list_fires = []
    list_frames = []

    for t in range(len_time):
        alpha = alphas[:, t]
        # Weight still missing to reach 1.0 before this frame's contribution.
        distribution_completion = (
            torch.ones([batch_size], device=hidden.device) - integrate
        )

        integrate += alpha
        list_fires.append(integrate)

        fire_place = integrate >= threshold
        # Where a fire happens, carry the excess weight into the next token.
        integrate = torch.where(
            fire_place,
            integrate - torch.ones([batch_size], device=hidden.device),
            integrate,
        )
        # `cur` goes into the finishing token; `remainds` seeds the next one.
        cur = torch.where(fire_place, distribution_completion, alpha)
        remainds = alpha - cur

        frame += cur[:, None] * hidden[:, t, :]
        list_frames.append(frame)
        frame = torch.where(
            fire_place[:, None].repeat(1, hidden_size),
            remainds[:, None] * hidden[:, t, :],
            frame,
        )

    fires = torch.stack(list_fires, 1)
    frames = torch.stack(list_frames, 1)
    list_ls = []
    len_labels = torch.round(alphas.sum(-1)).int()
    max_label_len = len_labels.max()
    for b in range(batch_size):
        fire = fires[b, :]
        # BUGFIX: use squeeze(-1), not squeeze(). With exactly one firing
        # frame, nonzero() returns shape (1, 1) and squeeze() collapsed it to
        # a 0-d tensor, which torch.index_select rejects (index must be 1-D).
        fired_idx = torch.nonzero(fire >= threshold).squeeze(-1)
        fired_frames = torch.index_select(frames[b, :, :], 0, fired_idx)
        pad_l = torch.zeros(
            [int(max_label_len - fired_frames.size(0)), hidden_size],
            device=hidden.device,
        )
        list_ls.append(torch.cat([fired_frames, pad_l], 0))
    return torch.stack(list_ls, 0), fires
Amphion/modules/wenet_extractor/paraformer/search/beam_search.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+ from itertools import chain
24
+ from typing import Any
25
+ from typing import Dict
26
+ from typing import List
27
+ from typing import Tuple
28
+ from typing import Union
29
+ from typing import NamedTuple
30
+
31
+ import torch
32
+
33
+ from modules.wenet_extractor.paraformer.utils import end_detect
34
+ from modules.wenet_extractor.paraformer.search.ctc import CTCPrefixScorer
35
+ from modules.wenet_extractor.paraformer.search.scorer_interface import (
36
+ ScorerInterface,
37
+ PartialScorerInterface,
38
+ )
39
+
40
+
41
class Hypothesis(NamedTuple):
    """Hypothesis data type for beam search.

    Fields:
        yseq: token id sequence; ``init_hyp`` seeds it with ``[sos]`` and
            ``append_token`` extends it one id at a time.
        score: accumulated weighted total score of this hypothesis.
        scores: per-scorer accumulated scores, keyed by scorer name.
        states: per-scorer decoding states, keyed by scorer name.

    NOTE(review): the ``dict()`` defaults are evaluated once at class creation
    and shared by every instance that relies on them; the beam search always
    passes explicit dicts, so this is presumably harmless — but do not mutate
    a default-constructed Hypothesis's dicts.
    """

    yseq: torch.Tensor
    score: Union[float, torch.Tensor] = 0
    scores: Dict[str, Union[float, torch.Tensor]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict (tensors/scores to plain types)."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()
56
+
57
+
58
class BeamSearchCIF(torch.nn.Module):
    """Beam search implementation driven by per-step CIF acoustic scores."""

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorers
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The number of vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()
        self.full_scorers = dict()
        self.part_scorers = dict()
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = torch.nn.ModuleDict()
        for k, v in scorers.items():
            w = weights.get(k, 0)
            # Zero-weight or missing scorers contribute nothing; skip them.
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, torch.nn.Module):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(
                f"{pre_beam_score_key} is not found in " f"{self.full_scorers}"
            )
        self.pre_beam_score_key = pre_beam_score_key
        # Pre-beam pruning only helps when partial scorers exist and the
        # pre-beam actually narrows the vocabulary.
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )

    def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.

        Args:
            x (torch.Tensor): The encoder output feature

        Returns:
            Hypothesis: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                score=0.0,
                scores=init_scores,
                states=init_states,
                yseq=torch.tensor([self.sos], device=x.device),
            )
        ]

    @staticmethod
    def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
        """Append new token to prefix tokens.

        Args:
            xs (torch.Tensor): The prefix token
            x (int): The new token to append

        Returns:
            torch.Tensor: New tensor contains: xs + [x] with xs.dtype and
                xs.device

        """
        x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
        return torch.cat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (torch.Tensor): 1D tensor of new partial tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: torch.Tensor, ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute topk full token ids and partial token ids.

        Args:
            weighted_scores (torch.Tensor): The weighted sum scores for each
                tokens.
                Its shape is `(self.n_vocab,)`.
            ids (torch.Tensor): The partial token ids to compute topk

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`

        """
        # no pre beam performed
        if weighted_scores.size(0) == ids.size(0):
            top_ids = weighted_scores.topk(self.beam_size)[1]
            return top_ids, top_ids

        # mask pruned in pre-beam not to select in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        top_ids = weighted_scores.topk(self.beam_size)[1]
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, torch.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, torch.Tensor],
        part_idx: int,
    ) -> Dict[str, torch.Tensor]:
        """Merge scores for new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, torch.Tensor]): scores by
                `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, torch.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and
                `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and
                `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: torch.Tensor, am_score: torch.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (torch.Tensor): Encoded speech feature (T, D)
            am_score (torch.Tensor): Acoustic scores for the current step,
                shape `(self.n_vocab,)`; added to every hypothesis.

        Returns:
            List[Hypotheses]: Best sorted hypotheses

        """
        best_hyps = []
        part_ids = torch.arange(self.n_vocab, device=x.device)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
            weighted_scores += am_score
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add previous hyp score
            weighted_scores += hyp.score

            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # will be (2 x beam at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(
                            hyp.scores, scores, j, part_scores, part_j
                        ),
                        states=self.merge_states(states, part_states, part_j),
                    )
                )

        # sort and prune 2 x beam -> beam
        best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
            : min(len(best_hyps), self.beam_size)
        ]
        return best_hyps

    def forward(
        self,
        x: torch.Tensor,
        am_scores: torch.Tensor,
        maxlenratio: float = 0.0,
        minlenratio: float = 0.0,
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            am_scores (torch.Tensor): Per-step acoustic scores (L, n_vocab);
                its length fixes the number of decoding steps.
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses a end-detect function
                to automatically find maximum hypothesis lengths
                If maxlenratio<0.0, its absolute value is interpreted
                as a constant max output length.
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds: one decoding step per acoustic score row
        maxlen = am_scores.shape[0]

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            best = self.search(running_hyps, x, am_scores[i])
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                break

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
            # Retry with a relaxed minimum length. BUGFIX: `am_scores` must be
            # forwarded here; the original call dropped it, passing the float
            # `maxlenratio` where a (L, n_vocab) tensor was expected, which
            # would crash on `am_scores.shape[0]` in the recursive call.
            return (
                []
                if minlenratio < 0.1
                else self.forward(
                    x, am_scores, maxlenratio, max(0.0, minlenratio - 0.1)
                )
            )

        return nbest_hyps

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (int): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam
                search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """

        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            # logging.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to a final list, and removed them from current
        # hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps
457
+
458
+
459
def build_beam_search(model, args, device):
    """Assemble a BeamSearchCIF decoder from a model and argparse-style args.

    A CTC prefix scorer is registered only when the model exposes one; the
    decoder/CTC/length-bonus weights come straight from ``args``. The returned
    module is moved to ``device`` as float32 and set to eval mode.
    """
    scorers = {}
    if model.ctc is not None:
        scorers["ctc"] = CTCPrefixScorer(ctc=model.ctc, eos=model.eos)
    weights = {
        "decoder": 1.0 - args.ctc_weight,
        "ctc": args.ctc_weight,
        "length_bonus": args.penalty,
    }
    # Pre-beam pruning over full scores is pointless with pure-CTC decoding.
    pre_beam_key = None if args.ctc_weight == 1.0 else "full"
    searcher = BeamSearchCIF(
        scorers=scorers,
        weights=weights,
        beam_size=args.beam_size,
        vocab_size=model.vocab_size,
        sos=model.sos,
        eos=model.eos,
        pre_beam_score_key=pre_beam_key,
    )
    searcher.to(device=device, dtype=torch.float32).eval()
    return searcher
Amphion/modules/wenet_extractor/paraformer/search/ctc_prefix_score.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+ import torch
24
+ import numpy as np
25
+
26
+ import six
27
+
28
+
29
class CTCPrefixScore(object):
    """Compute CTC label sequence scores.

    Based on Algorithm 2 in WATANABE et al.
    "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
    but extended to efficiently compute the probabilities of multiple labels
    simultaneously.
    """

    def __init__(self, x, blank, eos, xp):
        # xp is the array backend (numpy-compatible module) so the same code
        # can run against numpy or an equivalent API.
        self.xp = xp
        # Practical -inf for the log domain.
        self.logzero = -10000000000.0
        self.blank = blank
        self.eos = eos
        self.input_length = len(x)
        # x: per-frame label posteriors, indexed as x[t, label] — presumably
        # log-probabilities (values are summed with logaddexp); verify caller.
        self.x = x

    def initial_state(self):
        """Obtain an initial CTC state

        :return: CTC state
        """
        # initial CTC state is made of a frame x 2 tensor that corresponds to
        # r_t^n(<sos>) and r_t^b(<sos>), where 0 and 1 of axis=1 represent
        # superscripts n and b (non-blank and blank), respectively.
        r = self.xp.full((self.input_length, 2), self.logzero, dtype=np.float32)
        r[0, 1] = self.x[0, self.blank]
        # The blank path probability accumulates over consecutive blanks.
        for i in six.moves.range(1, self.input_length):
            r[i, 1] = r[i - 1, 1] + self.x[i, self.blank]
        return r

    def __call__(self, y, cs, r_prev):
        """Compute CTC prefix scores for next labels

        :param y : prefix label sequence
        :param cs : array of next labels
        :param r_prev: previous CTC state
        :return ctc_scores, ctc_states
        """
        # initialize CTC states
        output_length = len(y) - 1  # ignore sos
        # new CTC states are prepared as a frame x (n or b) x n_labels tensor
        # that corresponds to r_t^n(h) and r_t^b(h).
        r = self.xp.ndarray((self.input_length, 2, len(cs)), dtype=np.float32)
        xs = self.x[:, cs]
        if output_length == 0:
            # Empty prefix: a candidate can start at frame 0 as non-blank.
            r[0, 0] = xs[0]
            r[0, 1] = self.logzero
        else:
            # Frames before the prefix length cannot host the new label.
            r[output_length - 1] = self.logzero

        # prepare forward probabilities for the last label
        r_sum = self.xp.logaddexp(
            r_prev[:, 0], r_prev[:, 1]
        )  # log(r_t^n(g) + r_t^b(g))
        last = y[-1]
        if output_length > 0 and last in cs:
            # Repeating the last label requires an intervening blank, so that
            # candidate only connects through the blank path r_prev[:, 1].
            log_phi = self.xp.ndarray((self.input_length, len(cs)), dtype=np.float32)
            for i in six.moves.range(len(cs)):
                log_phi[:, i] = r_sum if cs[i] != last else r_prev[:, 1]
        else:
            log_phi = r_sum

        # compute forward probabilities log(r_t^n(h)), log(r_t^b(h)),
        # and log prefix probabilities log(psi)
        start = max(output_length, 1)
        log_psi = r[start - 1, 0]
        for t in six.moves.range(start, self.input_length):
            r[t, 0] = self.xp.logaddexp(r[t - 1, 0], log_phi[t - 1]) + xs[t]
            r[t, 1] = (
                self.xp.logaddexp(r[t - 1, 0], r[t - 1, 1]) + self.x[t, self.blank]
            )
            log_psi = self.xp.logaddexp(log_psi, log_phi[t - 1] + xs[t])

        # get P(...eos|X) that ends with the prefix itself
        eos_pos = self.xp.where(cs == self.eos)[0]
        if len(eos_pos) > 0:
            log_psi[eos_pos] = r_sum[-1]  # log(r_T^n(g) + r_T^b(g))

        # exclude blank probs
        blank_pos = self.xp.where(cs == self.blank)[0]
        if len(blank_pos) > 0:
            log_psi[blank_pos] = self.logzero

        # return the log prefix probability and CTC states, where the label axis
        # of the CTC states is moved to the first axis to slice it easily
        return log_psi, self.xp.rollaxis(r, 2)
116
+
117
+
118
class CTCPrefixScoreTH(object):
    """Batch processing of CTCPrefixScore.

    Based on Algorithm 2 in WATANABE et al.
    "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
    but extended to efficiently compute the label probabilities for multiple
    hypotheses simultaneously.
    See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based
    Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019.
    """

    def __init__(self, x, xlens, blank, eos, margin=0):
        """Construct CTC prefix scorer

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        :param torch.Tensor xlens: input lengths (B,)
        :param int blank: blank label id
        :param int eos: end-of-sequence id
        :param int margin: margin parameter for windowing (0 means no windowing)
        """
        # In the comment lines,
        # we assume T: input_length, B: batch size, W: beam width, O: output dim
        self.logzero = -10000000000.0  # practical -inf in the log domain
        self.blank = blank
        self.eos = eos
        self.batch = x.size(0)
        self.input_length = x.size(1)
        self.odim = x.size(2)
        self.dtype = x.dtype
        self.device = (
            torch.device("cuda:%d" % x.get_device())
            if x.is_cuda
            else torch.device("cpu")
        )
        # Pad the rest of posteriors in the batch.
        # NOTE: this mutates the caller's `x` in place.
        # TODO(takaaki-hori): need a better way without for-loops
        for i, l in enumerate(xlens):
            if l < self.input_length:
                x[i, l:, :] = self.logzero
                x[i, l:, blank] = 0
        # Reshape input x
        xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
        # xb broadcasts the blank posterior over the label axis so that both
        # the non-blank (xn) and blank (xb) views have the same shape.
        xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
        self.x = torch.stack([xn, xb])  # (2, T, B, O)
        self.end_frames = torch.as_tensor(xlens) - 1

        # Setup CTC windowing
        self.margin = margin
        if margin > 0:
            self.frame_ids = torch.arange(
                self.input_length, dtype=self.dtype, device=self.device
            )
        # Base indices for index conversion
        self.idx_bh = None
        self.idx_b = torch.arange(self.batch, device=self.device)
        self.idx_bo = (self.idx_b * self.odim).unsqueeze(1)

    def __call__(self, y, state, scoring_ids=None, att_w=None):
        """Compute CTC prefix scores for next labels

        :param list y: prefix label sequences
        :param tuple state: previous CTC state
        :param torch.Tensor scoring_ids: selected label ids for pre-selection
            of hypotheses (BW, O)
        :param torch.Tensor att_w: attention weights to decide CTC window
        :return new_state, ctc_local_scores (BW, O)
        """
        output_length = len(y[0]) - 1  # ignore sos
        last_ids = [yi[-1] for yi in y]  # last output label ids
        n_bh = len(last_ids)  # batch * hyps
        n_hyps = n_bh // self.batch  # assuming each utterance has the same
        self.scoring_num = scoring_ids.size(-1) if scoring_ids is not None else 0
        # prepare state info
        if state is None:
            # First call: blank path accumulates blank posteriors; non-blank
            # path starts at logzero (same as CTCPrefixScore.initial_state).
            r_prev = torch.full(
                (self.input_length, 2, self.batch, n_hyps),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            r_prev[:, 1] = torch.cumsum(self.x[0, :, :, self.blank], 0).unsqueeze(2)
            r_prev = r_prev.view(-1, 2, n_bh)
            s_prev = 0.0
            f_min_prev = 0
            f_max_prev = 1
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

        # select input dimensions for scoring
        if self.scoring_num > 0:
            # scoring_idmap maps a full label id to its position inside the
            # reduced scoring set (-1 for labels not being scored).
            scoring_idmap = torch.full(
                (n_bh, self.odim), -1, dtype=torch.long, device=self.device
            )
            snum = self.scoring_num
            if self.idx_bh is None or n_bh > len(self.idx_bh):
                self.idx_bh = torch.arange(n_bh, device=self.device).view(-1, 1)
            scoring_idmap[self.idx_bh[:n_bh], scoring_ids] = torch.arange(
                snum, device=self.device
            )
            scoring_idx = (
                scoring_ids + self.idx_bo.repeat(1, n_hyps).view(-1, 1)
            ).view(-1)
            x_ = torch.index_select(
                self.x.view(2, -1, self.batch * self.odim), 2, scoring_idx
            ).view(2, -1, n_bh, snum)
        else:
            scoring_ids = None
            scoring_idmap = None
            snum = self.odim
            x_ = self.x.unsqueeze(3).repeat(1, 1, 1, n_hyps, 1).view(2, -1, n_bh, snum)

        # new CTC forward probs are prepared as a (T x 2 x BW x S) tensor
        # that corresponds to r_t^n(h) and r_t^b(h) in a batch.
        r = torch.full(
            (self.input_length, 2, n_bh, snum),
            self.logzero,
            dtype=self.dtype,
            device=self.device,
        )
        if output_length == 0:
            r[0, 0] = x_[0, 0]

        r_sum = torch.logsumexp(r_prev, 1)
        log_phi = r_sum.unsqueeze(2).repeat(1, 1, snum)
        # Repeating the last label must go through the blank path only.
        if scoring_ids is not None:
            for idx in range(n_bh):
                pos = scoring_idmap[idx, last_ids[idx]]
                if pos >= 0:
                    log_phi[:, idx, pos] = r_prev[:, 1, idx]
        else:
            for idx in range(n_bh):
                log_phi[:, idx, last_ids[idx]] = r_prev[:, 1, idx]

        # decide start and end frames based on attention weights
        if att_w is not None and self.margin > 0:
            f_arg = torch.matmul(att_w, self.frame_ids)
            f_min = max(int(f_arg.min().cpu()), f_min_prev)
            f_max = max(int(f_arg.max().cpu()), f_max_prev)
            start = min(f_max_prev, max(f_min - self.margin, output_length, 1))
            end = min(f_max + self.margin, self.input_length)
        else:
            f_min = f_max = 0
            start = max(output_length, 1)
            end = self.input_length

        # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h))
        for t in range(start, end):
            rp = r[t - 1]
            rr = torch.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view(
                2, 2, n_bh, snum
            )
            r[t] = torch.logsumexp(rr, 1) + x_[:, t]

        # compute log prefix probabilities log(psi)
        log_phi_x = torch.cat((log_phi[0].unsqueeze(0), log_phi[:-1]), dim=0) + x_[0]
        if scoring_ids is not None:
            # Scatter the reduced scores back into the full label dimension;
            # labels outside the scoring set keep logzero.
            log_psi = torch.full(
                (n_bh, self.odim), self.logzero, dtype=self.dtype, device=self.device
            )
            log_psi_ = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )
            for si in range(n_bh):
                log_psi[si, scoring_ids[si]] = log_psi_[si]
        else:
            log_psi = torch.logsumexp(
                torch.cat((log_phi_x[start:end], r[start - 1, 0].unsqueeze(0)), dim=0),
                dim=0,
            )

        # eos score is the total prefix probability at the last valid frame.
        for si in range(n_bh):
            log_psi[si, self.eos] = r_sum[self.end_frames[si // n_hyps], si]

        # exclude blank probs
        log_psi[:, self.blank] = self.logzero

        # Return scores relative to the previous accumulated score.
        return (log_psi - s_prev), (r, log_psi, f_min, f_max, scoring_idmap)

    def index_select_state(self, state, best_ids):
        """Select CTC states according to best ids

        :param state : CTC state
        :param best_ids : index numbers selected by beam pruning (B, W)
        :return selected_state
        """
        r, s, f_min, f_max, scoring_idmap = state
        # convert ids to BHO space
        n_bh = len(s)
        n_hyps = n_bh // self.batch
        vidx = (best_ids + (self.idx_b * (n_hyps * self.odim)).view(-1, 1)).view(-1)
        # select hypothesis scores
        s_new = torch.index_select(s.view(-1), 0, vidx)
        s_new = s_new.view(-1, 1).repeat(1, self.odim).view(n_bh, self.odim)
        # convert ids to BHS space (S: scoring_num)
        if scoring_idmap is not None:
            snum = self.scoring_num
            hyp_idx = (best_ids // self.odim + (self.idx_b * n_hyps).view(-1, 1)).view(
                -1
            )
            label_ids = torch.fmod(best_ids, self.odim).view(-1)
            score_idx = scoring_idmap[hyp_idx, label_ids]
            # Labels outside the scoring set map to slot 0 as a safe default.
            score_idx[score_idx == -1] = 0
            vidx = score_idx + hyp_idx * snum
        else:
            snum = self.odim
        # select forward probabilities
        r_new = torch.index_select(r.view(-1, 2, n_bh * snum), 2, vidx).view(
            -1, 2, n_bh
        )
        return r_new, s_new, f_min, f_max

    def extend_prob(self, x):
        """Extend CTC prob.

        :param torch.Tensor x: input label posterior sequences (B, T, O)
        """

        if self.x.shape[1] < x.shape[1]:  # self.x (2,T,B,O); x (B,T,O)
            # Pad the rest of posteriors in the batch
            # TODO(takaaki-hori): need a better way without for-loops
            xlens = [x.size(1)]
            for i, l in enumerate(xlens):
                if l < self.input_length:
                    x[i, l:, :] = self.logzero
                    x[i, l:, self.blank] = 0
            tmp_x = self.x
            xn = x.transpose(0, 1)  # (B, T, O) -> (T, B, O)
            xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim)
            self.x = torch.stack([xn, xb])  # (2, T, B, O)
            # Keep the previously seen frames; only new frames are appended.
            self.x[:, : tmp_x.shape[1], :, :] = tmp_x
            self.input_length = x.size(1)
            self.end_frames = torch.as_tensor(xlens) - 1

    def extend_state(self, state):
        """Compute CTC prefix state.


        :param state : CTC state
        :return ctc_state
        """

        if state is None:
            # nothing to do
            return state
        else:
            r_prev, s_prev, f_min_prev, f_max_prev = state

            r_prev_new = torch.full(
                (self.input_length, 2),
                self.logzero,
                dtype=self.dtype,
                device=self.device,
            )
            start = max(r_prev.shape[0], 1)
            r_prev_new[0:start] = r_prev
            # Extend the blank path over the newly appended frames.
            for t in six.moves.range(start, self.input_length):
                r_prev_new[t, 1] = r_prev_new[t - 1, 1] + self.x[0, t, :, self.blank]

            return r_prev_new, s_prev, f_min_prev, f_max_prev
Amphion/modules/wenet_extractor/squeezeformer/positionwise_feed_forward.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+ """Positionwise feed forward layer definition."""
24
+
25
+ import torch
26
+
27
+
28
class PositionwiseFeedForward(torch.nn.Module):
    """Positionwise feed forward layer.

    FeedForward is applied on each position of the sequence.
    The output dim is the same as the input dim.

    Args:
        idim (int): Input dimension.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.
        activation (torch.nn.Module): Activation function.
        adaptive_scale (bool): Whether the learnable per-channel input
            scale/bias (``ada_scale`` / ``ada_bias``) is applied and trainable.
            The parameters are always registered so checkpoint layouts stay
            stable; when False they are frozen at identity (scale=1, bias=0)
            and not applied in ``forward``.
        init_weights (bool): If True, re-initialize both linear layers with a
            fan-in-scaled uniform distribution (Squeezeformer-style init).
    """

    def __init__(
        self,
        idim: int,
        hidden_units: int,
        dropout_rate: float,
        activation: torch.nn.Module = torch.nn.ReLU(),
        adaptive_scale: bool = False,
        init_weights: bool = False,
    ):
        """Construct a PositionwiseFeedForward object."""
        super(PositionwiseFeedForward, self).__init__()
        self.idim = idim
        self.hidden_units = hidden_units
        self.w_1 = torch.nn.Linear(idim, hidden_units)
        self.activation = activation
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.w_2 = torch.nn.Linear(hidden_units, idim)
        self.adaptive_scale = adaptive_scale
        # Always registered (trainable only when adaptive_scale is set); the
        # original's dead `self.ada_scale = None` pre-assignments are removed.
        self.ada_scale = torch.nn.Parameter(
            torch.ones([1, 1, idim]), requires_grad=adaptive_scale
        )
        self.ada_bias = torch.nn.Parameter(
            torch.zeros([1, 1, idim]), requires_grad=adaptive_scale
        )
        if init_weights:
            self.init_weights()

    def init_weights(self):
        """Re-initialize both linear layers with fan-in-scaled uniform init."""
        ffn1_max = self.idim**-0.5
        ffn2_max = self.hidden_units**-0.5
        torch.nn.init.uniform_(self.w_1.weight.data, -ffn1_max, ffn1_max)
        torch.nn.init.uniform_(self.w_1.bias.data, -ffn1_max, ffn1_max)
        torch.nn.init.uniform_(self.w_2.weight.data, -ffn2_max, ffn2_max)
        torch.nn.init.uniform_(self.w_2.bias.data, -ffn2_max, ffn2_max)

    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """Forward function.

        Args:
            xs: input tensor (B, L, D)
        Returns:
            output tensor, (B, L, D)
        """
        if self.adaptive_scale:
            # Learnable per-channel affine transform of the input.
            xs = self.ada_scale * xs + self.ada_bias
        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
Amphion/modules/wenet_extractor/transformer/decoder_layer.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+ """Decoder self-attention layer definition."""
24
+ from typing import Optional, Tuple
25
+
26
+ import torch
27
+ from torch import nn
28
+
29
+
30
class DecoderLayer(nn.Module):
    """Single decoder layer module.

    Args:
        size (int): Input dimension.
        self_attn (torch.nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (torch.nn.Module): Inter-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
            If `None` is passed, Inter-attention is not used, such as
            CIF, GPT, and other decoder only model.
        feed_forward (torch.nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: to use layer_norm after each sub-block.
    """

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        src_attn: Optional[nn.Module],
        feed_forward: nn.Module,
        dropout_rate: float,
        normalize_before: bool = True,
    ):
        """Construct an DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # One LayerNorm per sub-block: self-attention, cross-attention, FFN.
        self.norm1 = nn.LayerNorm(size, eps=1e-5)
        self.norm2 = nn.LayerNorm(size, eps=1e-5)
        self.norm3 = nn.LayerNorm(size, eps=1e-5)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before

    def forward(
        self,
        tgt: torch.Tensor,
        tgt_mask: torch.Tensor,
        memory: torch.Tensor,
        memory_mask: torch.Tensor,
        cache: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (torch.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        # ---- Self-attention sub-block (pre- or post-norm residual). ----
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            # Fix: the message is now an f-string; the original was a plain
            # string literal, so the shapes were never interpolated.
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size,
            ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        x = residual + self.dropout(self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
        if not self.normalize_before:
            x = self.norm1(x)

        # ---- Cross-attention sub-block (skipped for decoder-only models). ----
        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm2(x)
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask)[0]
            )
            if not self.normalize_before:
                x = self.norm2(x)

        # ---- Feed-forward sub-block. ----
        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        # Re-attach the cached prefix so the caller gets the full sequence.
        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
Amphion/modules/wenet_extractor/transformer/subsampling.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This module is from [WeNet](https://github.com/wenet-e2e/wenet).
2
+
3
+ # ## Citations
4
+
5
+ # ```bibtex
6
+ # @inproceedings{yao2021wenet,
7
+ # title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
8
+ # author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
9
+ # booktitle={Proc. Interspeech},
10
+ # year={2021},
11
+ # address={Brno, Czech Republic },
12
+ # organization={IEEE}
13
+ # }
14
+
15
+ # @article{zhang2022wenet,
16
+ # title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
17
+ # author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
18
+ # journal={arXiv preprint arXiv:2203.15455},
19
+ # year={2022}
20
+ # }
21
+ #
22
+
23
+
24
+ """Subsampling layer definition."""
25
+
26
+ from typing import Tuple, Union
27
+
28
+ import torch
29
+
30
+
31
class BaseSubsampling(torch.nn.Module):
    """Common interface for the subsampling front-ends.

    Subclasses overwrite ``subsampling_rate`` (frame-rate reduction factor)
    and ``right_context`` (future input frames one output frame depends on),
    and attach a ``pos_enc`` positional-encoding module.
    """

    def __init__(self):
        super().__init__()
        # Defaults correspond to "no subsampling at all".
        self.right_context = 0
        self.subsampling_rate = 1

    def position_encoding(
        self, offset: Union[int, torch.Tensor], size: int
    ) -> torch.Tensor:
        """Delegate positional-encoding lookup to the attached ``pos_enc``."""
        return self.pos_enc.position_encoding(offset, size)
41
+
42
+
43
class LinearNoSubsampling(BaseSubsampling):
    """Project the input linearly without any subsampling.

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(
        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
    ):
        """Construct an linear object."""
        super().__init__()
        # Project, normalize, regularize -- the time axis is untouched.
        projection = torch.nn.Linear(idim, odim)
        norm = torch.nn.LayerNorm(odim, eps=1e-5)
        drop = torch.nn.Dropout(dropout_rate)
        self.out = torch.nn.Sequential(projection, norm, drop)
        self.pos_enc = pos_enc_class
        self.right_context = 0
        self.subsampling_rate = 1

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Transform x without changing its length.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: projected tensor (#batch, time, odim).
            torch.Tensor: positional encoding.
            torch.Tensor: the unchanged input mask (#batch, 1, time).
        """
        projected = self.out(x)
        encoded, pos_emb = self.pos_enc(projected, offset)
        return encoded, pos_emb, x_mask
89
+
90
+
91
class Conv2dSubsampling4(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/4 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(
        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
    ):
        """Construct an Conv2dSubsampling4 object."""
        super().__init__()
        # Two stride-2 3x3 convolutions: each halves both time and features.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        # Feature width left after the two convolutions.
        freq_out = ((idim - 1) // 2 - 1) // 2
        self.out = torch.nn.Sequential(torch.nn.Linear(odim * freq_out, odim))
        self.pos_enc = pos_enc_class
        # The right context for every conv layer is computed by:
        # (kernel_size - 1) * frame_rate_of_this_layer
        self.subsampling_rate = 4
        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x to 1/4 of its length.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time // 4, odim).
            torch.Tensor: positional encoding.
            torch.Tensor: Subsampled mask (#batch, 1, time // 4).
        """
        feats = self.conv(x.unsqueeze(1))  # (b, 1, t, f) -> (b, c, t', f')
        batch, chans, frames, freq = feats.size()
        flat = feats.transpose(1, 2).contiguous().view(batch, frames, chans * freq)
        out, pos_emb = self.pos_enc(self.out(flat), offset)
        # Two stride-2 stages -> drop mask frames the same way.
        subsampled_mask = x_mask[:, :, 2::2][:, :, 2::2]
        return out, pos_emb, subsampled_mask
148
+
149
+
150
class Conv2dSubsampling6(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/6 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
        pos_enc_class (torch.nn.Module): Custom position encoding layer.
    """

    def __init__(
        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
    ):
        """Construct an Conv2dSubsampling6 object."""
        super().__init__()
        # A stride-2 3x3 conv followed by a stride-3 5x5 conv: 2 * 3 = 6x
        # reduction along the time axis.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 5, 3),
            torch.nn.ReLU(),
        )
        freq_out = ((idim - 1) // 2 - 2) // 3
        self.linear = torch.nn.Linear(odim * freq_out, odim)
        self.pos_enc = pos_enc_class
        self.subsampling_rate = 6
        # 10 = (3 - 1) * 1 + (5 - 1) * 2
        self.right_context = 10

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x to 1/6 of its length.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time // 6, odim).
            torch.Tensor: positional encoding.
            torch.Tensor: Subsampled mask (#batch, 1, time // 6).
        """
        feats = self.conv(x.unsqueeze(1))  # (b, 1, t, f) -> (b, c, t', f')
        batch, chans, frames, freq = feats.size()
        flat = feats.transpose(1, 2).contiguous().view(batch, frames, chans * freq)
        out, pos_emb = self.pos_enc(self.linear(flat), offset)
        # Stride-2 then stride-3 -> matching slicing of the mask.
        return out, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
200
+
201
+
202
class Conv2dSubsampling8(BaseSubsampling):
    """Convolutional 2D subsampling (to 1/8 length).

    Args:
        idim (int): Input dimension.
        odim (int): Output dimension.
        dropout_rate (float): Dropout rate.
    """

    def __init__(
        self, idim: int, odim: int, dropout_rate: float, pos_enc_class: torch.nn.Module
    ):
        """Construct an Conv2dSubsampling8 object."""
        super().__init__()
        # Three stride-2 3x3 convolutions: 2 * 2 * 2 = 8x time reduction.
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(odim, odim, 3, 2),
            torch.nn.ReLU(),
        )
        freq_out = (((idim - 1) // 2 - 1) // 2 - 1) // 2
        self.linear = torch.nn.Linear(odim * freq_out, odim)
        self.pos_enc = pos_enc_class
        self.subsampling_rate = 8
        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
        self.right_context = 14

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        offset: Union[int, torch.Tensor] = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Subsample x to 1/8 of its length.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, idim).
            x_mask (torch.Tensor): Input mask (#batch, 1, time).

        Returns:
            torch.Tensor: Subsampled tensor (#batch, time // 8, odim).
            torch.Tensor: positional encoding.
            torch.Tensor: Subsampled mask (#batch, 1, time // 8).
        """
        feats = self.conv(x.unsqueeze(1))  # (b, 1, t, f) -> (b, c, t', f')
        batch, chans, frames, freq = feats.size()
        flat = feats.transpose(1, 2).contiguous().view(batch, frames, chans * freq)
        out, pos_emb = self.pos_enc(self.linear(flat), offset)
        # Three stride-2 stages -> slice the mask three times.
        return out, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
Amphion/modules/wenet_extractor/utils/__init__.py ADDED
File without changes
Amphion/preprocessors/cdmusiceval.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from glob import glob
7
+ import os
8
+ import json
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+ from collections import defaultdict
12
+
13
+ from utils.util import has_existed, remove_and_create
14
+ from utils.audio_slicer import split_utterances_from_audio
15
+
16
+
17
def split_to_utterances(input_dir, output_dir):
    """Slice every vocal file under ``input_dir`` into short utterances.

    Each file name is parsed as ``*_*_{song}-{singer}``; its utterances are
    written to ``output_dir/{singer}/{song}/`` (max 10 s per utterance).
    """
    print("Splitting to utterances for {}...".format(input_dir))

    for wav_file in tqdm(sorted(glob("*", root_dir=input_dir))):
        # Singer name and song name are encoded in the third "_" field.
        song_name, singer_name = wav_file.split("_")[2].split("-")
        save_dir = os.path.join(output_dir, singer_name, song_name)

        split_utterances_from_audio(
            os.path.join(input_dir, wav_file), save_dir, max_duration_of_utterance=10
        )
44
+
45
+
46
def _main(dataset_path):
    """Rebuild ``{dataset_path}/utterances`` by splitting the raw vocal files."""
    utterance_dir = os.path.join(dataset_path, "utterances")
    remove_and_create(utterance_dir)
    split_to_utterances(os.path.join(dataset_path, "vocal"), utterance_dir)
53
+
54
+
55
def statistics(utterance_dir):
    """Scan ``utterance_dir`` (layout: singer/song/uid.wav) and summarize it.

    Returns:
        singers2songs: singer -> song -> list of utterance ids.
        unique_singers: sorted list of distinct singer names.
    """
    singers = []
    songs = []
    singers2songs = defaultdict(lambda: defaultdict(list))

    for singer_info in glob(utterance_dir + "/*"):
        singer = singer_info.split("/")[-1]

        for song_info in glob(singer_info + "/*"):
            song = song_info.split("/")[-1]

            singers.append(singer)
            songs.append(song)

            for utt in glob(song_info + "/*.wav"):
                uid = utt.split("/")[-1].split(".")[0]
                singers2songs[singer][song].append(uid)

    unique_singers = sorted(set(singers))
    unique_songs = sorted(set(songs))

    print(
        "Statistics: {} singers, {} utterances ({} unique songs)".format(
            len(unique_singers), len(songs), len(unique_songs)
        )
    )
    print("Singers: \n{}".format("\t".join(unique_singers)))
    return singers2songs, unique_singers
91
+
92
+
93
def main(output_path, dataset_path):
    """Prepare the CD Music Eval metadata for Amphion.

    Splits the raw "vocal" stems into utterances if not done yet, then writes
    ``train.json`` / ``test.json``, the singer lookup table ``singers.json``
    and an ``utt2singer`` mapping under ``{output_path}/cdmusiceval``.

    Note: every utterance goes to the *test* split; the train split is
    intentionally left empty for this evaluation-only dataset.
    """
    print("-" * 10)
    print("Preparing samples for CD Music Eval...\n")

    if not os.path.exists(os.path.join(dataset_path, "utterances")):
        print("Spliting into utterances...\n")
        _main(dataset_path)

    save_dir = os.path.join(output_path, "cdmusiceval")
    os.makedirs(save_dir, exist_ok=True)
    train_output_file = os.path.join(save_dir, "train.json")
    test_output_file = os.path.join(save_dir, "test.json")
    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")
    # Skip the (expensive) re-scan when all outputs already exist.
    if (
        has_existed(train_output_file)
        and has_existed(test_output_file)
        and has_existed(singer_dict_file)
        and has_existed(utt2singer_file)
    ):
        return

    # Load
    utt_path = os.path.join(dataset_path, "utterances")
    singers2songs, unique_singers = statistics(utt_path)

    # We select songs of standard samples as test songs
    train = []
    test = []

    train_index_count = 0
    test_index_count = 0

    train_total_duration = 0
    test_total_duration = 0

    # Fix: the mapping file is now opened in a context manager so it is
    # closed (and flushed) even if an exception is raised mid-scan; the
    # original leaked the handle.
    with open(utt2singer_file, "w") as utt2singer:
        for singer, songs in tqdm(singers2songs.items()):
            song_names = list(songs.keys())

            for chosen_song in song_names:
                for chosen_uid in songs[chosen_song]:
                    res = {
                        "Dataset": "cdmusiceval",
                        "Singer": singer,
                        "Uid": "{}_{}_{}".format(singer, chosen_song, chosen_uid),
                    }
                    res["Path"] = "{}/{}/{}.wav".format(
                        singer, chosen_song, chosen_uid
                    )
                    res["Path"] = os.path.join(utt_path, res["Path"])
                    assert os.path.exists(res["Path"])

                    waveform, sample_rate = torchaudio.load(res["Path"])
                    duration = waveform.size(-1) / sample_rate
                    res["Duration"] = duration

                    # Skip (effectively) empty clips.
                    if duration <= 1e-8:
                        continue

                    res["index"] = test_index_count
                    test_total_duration += duration
                    test.append(res)
                    test_index_count += 1

                    utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

    print("#Train = {}, #Test = {}".format(len(train), len(test)))
    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )

    # Save train.json and test.json
    with open(train_output_file, "w") as f:
        json.dump(train, f, indent=4, ensure_ascii=False)
    with open(test_output_file, "w") as f:
        json.dump(test, f, indent=4, ensure_ascii=False)

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(unique_singers)}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)
Amphion/utils/data_utils.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ from scipy.interpolate import interp1d
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+
15
def intersperse(lst, item):
    """
    Insert ``item`` between any two consecutive elements of ``lst``,
    including the beginning and end of the list.

    Args:
        lst: the source sequence.
        item: the separator value to insert.

    Returns:
        A new list of length ``2 * len(lst) + 1``.

    Example:
        >>> intersperse([1, 74, 5, 31], 0)
        [0, 1, 0, 74, 0, 5, 0, 31, 0]
    """
    # Fix: the original doctest called intersperse(0, [1, 74, 5, 31]),
    # i.e. with the arguments swapped relative to the signature.
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result
26
+
27
+
28
def load_content_feature_path(meta_data, processed_dir, feat_dir):
    """Map each utterance key ("{Dataset}_{Uid}") to its content-feature path.

    The path is ``{processed_dir}/{Dataset}/{feat_dir}/{Uid}.npy``.
    """
    return {
        info["Dataset"] + "_" + info["Uid"]: os.path.join(
            processed_dir, info["Dataset"], feat_dir, f'{info["Uid"]}.npy'
        )
        for info in meta_data
    }
38
+
39
+
40
def load_source_content_feature_path(meta_data, feat_dir):
    """Map each utterance id in ``meta_data`` to ``{feat_dir}/{utt}.npy``."""
    return {utt: os.path.join(feat_dir, f"{utt}.npy") for utt in meta_data}
47
+
48
+
49
def get_spk_map(spk2id_path, utt2spk_path):
    """Load the speaker-id table and the utterance-to-speaker mapping.

    Args:
        spk2id_path: JSON file mapping speaker name -> integer id.
        utt2spk_path: text file of tab-separated "utt\\tspk" lines.

    Returns:
        (spk2id, utt2spk) dictionaries.
    """
    with open(spk2id_path, "r") as spk2id_file:
        spk2id = json.load(spk2id_file)
    utt2spk = {}
    with open(utt2spk_path, encoding="utf-8") as f:
        for line in f:
            utt, spk = line.strip().split("\t")
            utt2spk[utt] = spk
    return spk2id, utt2spk
58
+
59
+
60
def get_target_f0_median(f0_dir):
    """Median F0 over all voiced (non-zero) frames of every .npy file in ``f0_dir``."""
    collected = []
    for fname in os.listdir(f0_dir):
        # Only F0 feature files; everything else in the folder is ignored.
        if fname.endswith(".npy"):
            collected.extend(np.load(os.path.join(f0_dir, fname)).tolist())

    all_f0 = np.array(collected)
    voiced = np.where(all_f0 != 0)
    return np.median(all_f0[voiced])
72
+
73
+
74
def get_conversion_f0_factor(source_f0, target_median, source_median=None):
    """Align the median between source f0 and target f0.

    Note: Here we use multiplication, whose factor is target_median/source_median

    Reference: Frequency and pitch interval
    http://blog.ccyg.studio/article/be12c2ee-d47c-4098-9782-ca76da3035e4/
    """
    if source_median is None:
        # Median over voiced (non-zero) frames only.
        source_median = np.median(source_f0[np.where(source_f0 != 0)])
    return source_median, target_median / source_median
87
+
88
+
89
def transpose_key(frame_pitch, trans_key):
    """Shift a frame-level pitch contour by ``trans_key`` semitones."""
    # Transpose by user's argument
    print("Transpose key = {} ...\n".format(trans_key))

    semitone_ratio = 2 ** (trans_key / 12)
    return frame_pitch * semitone_ratio
95
+
96
+
97
def pitch_shift_to_target(frame_pitch, target_pitch_median, source_pitch_median=None):
    """Scale a pitch contour so its median matches ``target_pitch_median``."""
    # Loading F0 Base (median) and shift
    source_pitch_median, factor = get_conversion_f0_factor(
        frame_pitch, target_pitch_median, source_pitch_median
    )
    print(
        "Auto transposing: source f0 median = {:.1f}, target f0 median = {:.1f}, factor = {:.2f}".format(
            source_pitch_median, target_pitch_median, factor
        )
    )
    return frame_pitch * factor
109
+
110
+
111
def load_frame_pitch(
    meta_data,
    processed_dir,
    pitch_dir,
    use_log_scale=False,
    return_norm=False,
    interoperate=False,
    utt2spk=None,
):
    """Load frame-level pitch (F0) features for every utterance in ``meta_data``.

    Pitch is read from ``{processed_dir}/{Dataset}/{pitch_dir}/{Uid}.npy``.
    Voiced frames (pitch != 0) may be log-scaled, and the contours may be
    z-normalized. With ``utt2spk`` given, normalization statistics are
    computed per speaker instead of globally.

    Returns:
        utt2pitch: utt -> pitch array (normalized if ``return_norm``).
        utt2uv: utt -> boolean voiced mask (pitch != 0).
        pitch_statistic: dict with mean/std (global) or a list of
            per-speaker dicts with spk/mean/std.
    """
    utt2pitch = {}
    utt2uv = {}
    if utt2spk is None:
        # Global branch: one scaler fitted over all utterances.
        pitch_scaler = StandardScaler()
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            pitch_path = os.path.join(
                processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
            )
            pitch = np.load(pitch_path)
            assert len(pitch) > 0
            uv = pitch != 0
            utt2uv[utt] = uv
            if use_log_scale:
                # Log-scale voiced frames only; zeros stay zero.
                nonzero_idxes = np.where(pitch != 0)[0]
                pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
            utt2pitch[utt] = pitch
            pitch_scaler.partial_fit(pitch.reshape(-1, 1))

        mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
        if return_norm:
            # Second pass: z-normalize with the fitted global statistics.
            for utt_info in meta_data:
                utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
                pitch = utt2pitch[utt]
                normalized_pitch = (pitch - mean) / std
                utt2pitch[utt] = normalized_pitch
        pitch_statistic = {"mean": mean, "std": std}
    else:
        # Per-speaker branch: group utterances by speaker, fit one scaler each.
        spk2utt = {}
        pitch_statistic = []
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            if not utt2spk[utt] in spk2utt:
                spk2utt[utt2spk[utt]] = []
            spk2utt[utt2spk[utt]].append(utt)

        for spk in spk2utt:
            pitch_scaler = StandardScaler()
            for utt in spk2utt[spk]:
                # utt key is "{Dataset}_{Uid}"; Uid itself may contain "_".
                dataset = utt.split("_")[0]
                uid = "_".join(utt.split("_")[1:])
                pitch_path = os.path.join(
                    processed_dir, dataset, pitch_dir, f"{uid}.npy"
                )
                pitch = np.load(pitch_path)
                assert len(pitch) > 0
                uv = pitch != 0
                utt2uv[utt] = uv
                if use_log_scale:
                    nonzero_idxes = np.where(pitch != 0)[0]
                    pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
                utt2pitch[utt] = pitch
                pitch_scaler.partial_fit(pitch.reshape(-1, 1))

            mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
            if return_norm:
                for utt in spk2utt[spk]:
                    pitch = utt2pitch[utt]
                    normalized_pitch = (pitch - mean) / std
                    utt2pitch[utt] = normalized_pitch
            pitch_statistic.append({"spk": spk, "mean": mean, "std": std})

    return utt2pitch, utt2uv, pitch_statistic
183
+
184
+
185
# discard (marked as discarded upstream; kept for reference)
def load_phone_pitch(
    meta_data,
    processed_dir,
    pitch_dir,
    utt2dur,
    use_log_scale=False,
    return_norm=False,
    interoperate=True,
    utt2spk=None,
):
    """Load phone-level averaged pitch for every utterance in ``meta_data``.

    Frame-level F0 is read from ``{processed_dir}/{Dataset}/{pitch_dir}/{Uid}.npy``,
    averaged per phone via ``phone_average_pitch`` using durations in
    ``utt2dur``, optionally log-scaled on voiced values and z-normalized.
    With ``utt2spk`` given, statistics are computed per speaker.

    Returns:
        utt2pitch: utt -> phone-level pitch array.
        utt2uv: utt -> frame-level voiced mask (frame pitch != 0).
        pitch_statistic: dict (global) or list of per-speaker dicts with
            mean/std/min_value/max_value.
    """
    print("Load Phone Pitch")
    utt2pitch = {}
    utt2uv = {}
    if utt2spk is None:
        # Global branch: one scaler over all utterances.
        pitch_scaler = StandardScaler()
        for utt_info in tqdm(meta_data):
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            pitch_path = os.path.join(
                processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
            )
            frame_pitch = np.load(pitch_path)
            assert len(frame_pitch) > 0
            uv = frame_pitch != 0
            utt2uv[utt] = uv
            phone_pitch = phone_average_pitch(frame_pitch, utt2dur[utt], interoperate)
            if use_log_scale:
                # Log-scale voiced values only; zeros stay zero.
                nonzero_idxes = np.where(phone_pitch != 0)[0]
                phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
            utt2pitch[utt] = phone_pitch
            # Outlier-filtered values feed the scaler only; the stored
            # contour keeps all values.
            pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))

        mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
        max_value = np.finfo(np.float64).min
        min_value = np.finfo(np.float64).max
        if return_norm:
            for utt_info in meta_data:
                utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
                pitch = utt2pitch[utt]
                normalized_pitch = (pitch - mean) / std
                max_value = max(max_value, max(normalized_pitch))
                min_value = min(min_value, min(normalized_pitch))
                utt2pitch[utt] = normalized_pitch
                # NOTE(review): this path is computed but never used or
                # written — apparently dead code in this discarded loader.
                phone_normalized_pitch_path = os.path.join(
                    processed_dir,
                    utt_info["Dataset"],
                    "phone_level_" + pitch_dir,
                    f'{utt_info["Uid"]}.npy',
                )
        pitch_statistic = {
            "mean": mean,
            "std": std,
            "min_value": min_value,
            "max_value": max_value,
        }
    else:
        # Per-speaker branch: group utterances by speaker first.
        spk2utt = {}
        pitch_statistic = []
        for utt_info in tqdm(meta_data):
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            if not utt2spk[utt] in spk2utt:
                spk2utt[utt2spk[utt]] = []
            spk2utt[utt2spk[utt]].append(utt)

        for spk in spk2utt:
            pitch_scaler = StandardScaler()
            for utt in spk2utt[spk]:
                # utt key is "{Dataset}_{Uid}"; Uid itself may contain "_".
                dataset = utt.split("_")[0]
                uid = "_".join(utt.split("_")[1:])
                pitch_path = os.path.join(
                    processed_dir, dataset, pitch_dir, f"{uid}.npy"
                )
                frame_pitch = np.load(pitch_path)
                assert len(frame_pitch) > 0
                uv = frame_pitch != 0
                utt2uv[utt] = uv
                phone_pitch = phone_average_pitch(
                    frame_pitch, utt2dur[utt], interoperate
                )
                if use_log_scale:
                    nonzero_idxes = np.where(phone_pitch != 0)[0]
                    phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
                utt2pitch[utt] = phone_pitch
                pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))

            mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
            max_value = np.finfo(np.float64).min
            min_value = np.finfo(np.float64).max

            if return_norm:
                for utt in spk2utt[spk]:
                    pitch = utt2pitch[utt]
                    normalized_pitch = (pitch - mean) / std
                    max_value = max(max_value, max(normalized_pitch))
                    min_value = min(min_value, min(normalized_pitch))
                    utt2pitch[utt] = normalized_pitch
            pitch_statistic.append(
                {
                    "spk": spk,
                    "mean": mean,
                    "std": std,
                    "min_value": min_value,
                    "max_value": max_value,
                }
            )

    return utt2pitch, utt2uv, pitch_statistic
292
+
293
+
294
def phone_average_pitch(pitch, dur, interoperate=False):
    """Average frame-level pitch into one value per phone.

    Args:
        pitch: 1-D array of frame-level pitch values (0 = unvoiced).
        dur: per-phone durations, in frames.
        interoperate: if True, linearly interpolate across unvoiced (zero)
            frames before averaging (likely a misspelling of "interpolate").

    Returns:
        np.ndarray of shape (len(dur),) with the mean pitch of each phone;
        phones with zero duration or starting beyond the pitch track get 0.
    """
    if interoperate:
        voiced = np.where(pitch != 0)[0]
        # Extend with the first/last voiced value outside the voiced range.
        edge_fill = (pitch[voiced[0]], pitch[voiced[-1]])
        interp_fn = interp1d(
            voiced, pitch[voiced], fill_value=edge_fill, bounds_error=False
        )
        pitch = interp_fn(np.arange(0, len(pitch)))

    averaged = np.zeros(len(dur))
    frame_cursor = 0
    for idx, raw_dur in enumerate(dur):
        n_frames = int(raw_dur)
        if n_frames > 0 and frame_cursor < len(pitch):
            averaged[idx] = np.mean(pitch[frame_cursor : frame_cursor + n_frames])
        else:
            averaged[idx] = 0
        frame_cursor += n_frames
    return averaged
316
+
317
+
318
def load_energy(
    meta_data,
    processed_dir,
    energy_dir,
    use_log_scale=False,
    return_norm=False,
    utt2spk=None,
):
    """Load per-utterance energy features and optionally z-normalize them.

    In the global branch (``utt2spk is None``) normalization statistics are
    read from a precomputed ``statistics.json``; in the per-speaker branch a
    StandardScaler is fitted per speaker.

    Returns:
        (utt2energy, energy_statistic)
    """
    utt2energy = {}
    if utt2spk is None:
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            energy_path = os.path.join(
                processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
            )
            # Missing files are silently skipped.
            if not os.path.exists(energy_path):
                continue
            energy = np.load(energy_path)
            assert len(energy) > 0

            if use_log_scale:
                nonzero_idxes = np.where(energy != 0)[0]
                energy[nonzero_idxes] = np.log(energy[nonzero_idxes])
            utt2energy[utt] = energy

        if return_norm:
            # NOTE(review): relies on `utt_info` leaking out of the loop above,
            # i.e. the statistics file of the *last* utterance's dataset is
            # read; verify this is intended for multi-dataset runs.
            with open(
                os.path.join(
                    processed_dir, utt_info["Dataset"], energy_dir, "statistics.json"
                )
            ) as f:
                stats = json.load(f)
            # NOTE(review): mean is looked up per dataset/singer, but std is
            # hard-coded to the "LJSpeech_LJSpeech" entry — looks like a bug;
            # confirm before reuse outside LJSpeech.
            mean, std = (
                stats[utt_info["Dataset"] + "_" + utt_info["Singer"]][
                    "voiced_positions"
                ]["mean"],
                stats["LJSpeech_LJSpeech"]["voiced_positions"]["std"],
            )
            for utt in utt2energy.keys():
                energy = utt2energy[utt]
                normalized_energy = (energy - mean) / std
                utt2energy[utt] = normalized_energy

            # NOTE(review): energy_statistic (and mean/std) are only defined
            # when return_norm is True in this branch.
            energy_statistic = {"mean": mean, "std": std}
    else:
        # Per-speaker statistics: group utterances by speaker first.
        spk2utt = {}
        energy_statistic = []
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            if not utt2spk[utt] in spk2utt:
                spk2utt[utt2spk[utt]] = []
            spk2utt[utt2spk[utt]].append(utt)

        for spk in spk2utt:
            energy_scaler = StandardScaler()
            for utt in spk2utt[spk]:
                # Keys are "<Dataset>_<Uid>"; the Uid itself may contain "_".
                dataset = utt.split("_")[0]
                uid = "_".join(utt.split("_")[1:])
                energy_path = os.path.join(
                    processed_dir, dataset, energy_dir, f"{uid}.npy"
                )
                if not os.path.exists(energy_path):
                    continue
                frame_energy = np.load(energy_path)
                assert len(frame_energy) > 0

                if use_log_scale:
                    nonzero_idxes = np.where(frame_energy != 0)[0]
                    frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
                utt2energy[utt] = frame_energy
                energy_scaler.partial_fit(frame_energy.reshape(-1, 1))

            mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
            if return_norm:
                for utt in spk2utt[spk]:
                    energy = utt2energy[utt]
                    normalized_energy = (energy - mean) / std
                    utt2energy[utt] = normalized_energy
            energy_statistic.append({"spk": spk, "mean": mean, "std": std})

    return utt2energy, energy_statistic
399
+
400
+
401
def load_frame_energy(
    meta_data,
    processed_dir,
    energy_dir,
    use_log_scale=False,
    return_norm=False,
    interoperate=False,
    utt2spk=None,
):
    """Load frame-level energy and optionally z-normalize it.

    A StandardScaler is fitted either globally (``utt2spk is None``) or per
    speaker.

    Args:
        meta_data: list of dicts with "Dataset" and "Uid" keys.
        processed_dir: root directory of preprocessed features.
        energy_dir: sub-directory holding per-utterance energy ``.npy`` files.
        use_log_scale: log-transform non-zero energy values.
        return_norm: if True, replace energies with (e - mean) / std.
        interoperate: unused here; kept for signature parity with the pitch
            loaders.
        utt2spk: optional utterance -> speaker mapping for per-speaker stats.

    Returns:
        (utt2energy, energy_statistic); dict of stats in the global case,
        list of per-speaker dicts otherwise.
    """
    utt2energy = {}
    if utt2spk is None:
        energy_scaler = StandardScaler()
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            energy_path = os.path.join(
                processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
            )
            frame_energy = np.load(energy_path)
            assert len(frame_energy) > 0

            if use_log_scale:
                nonzero_idxes = np.where(frame_energy != 0)[0]
                frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
            utt2energy[utt] = frame_energy
            energy_scaler.partial_fit(frame_energy.reshape(-1, 1))

        mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
        if return_norm:
            for utt_info in meta_data:
                utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
                energy = utt2energy[utt]
                normalized_energy = (energy - mean) / std
                utt2energy[utt] = normalized_energy
        energy_statistic = {"mean": mean, "std": std}

    else:
        # Per-speaker statistics: group utterances by speaker first.
        spk2utt = {}
        energy_statistic = []
        for utt_info in meta_data:
            utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
            if not utt2spk[utt] in spk2utt:
                spk2utt[utt2spk[utt]] = []
            spk2utt[utt2spk[utt]].append(utt)

        for spk in spk2utt:
            energy_scaler = StandardScaler()
            for utt in spk2utt[spk]:
                # Keys are "<Dataset>_<Uid>"; the Uid itself may contain "_".
                dataset = utt.split("_")[0]
                uid = "_".join(utt.split("_")[1:])
                energy_path = os.path.join(
                    processed_dir, dataset, energy_dir, f"{uid}.npy"
                )
                frame_energy = np.load(energy_path)
                assert len(frame_energy) > 0

                if use_log_scale:
                    nonzero_idxes = np.where(frame_energy != 0)[0]
                    frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
                utt2energy[utt] = frame_energy
                energy_scaler.partial_fit(frame_energy.reshape(-1, 1))

            mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
            if return_norm:
                for utt in spk2utt[spk]:
                    energy = utt2energy[utt]
                    normalized_energy = (energy - mean) / std
                    utt2energy[utt] = normalized_energy
            energy_statistic.append({"spk": spk, "mean": mean, "std": std})

    return utt2energy, energy_statistic
471
+
472
+
473
def align_length(feature, target_len, pad_value=0.0):
    """Pad or truncate ``feature`` along its last axis to ``target_len``.

    Supports 1-D arrays of shape (T,) and 2-D arrays of shape (C, T); padding
    uses ``pad_value``.
    """
    current_len = feature.shape[-1]
    ndim = len(feature.shape)
    if ndim == 2:
        # 2-D data: pad/cut the time (last) axis only.
        if target_len > current_len:
            pad_spec = ((0, 0), (0, target_len - current_len))
            feature = np.pad(feature, pad_spec, constant_values=pad_value)
        else:
            feature = feature[:, :target_len]
    elif ndim == 1:
        # 1-D data.
        if target_len > current_len:
            feature = np.pad(
                feature, (0, target_len - current_len), constant_values=pad_value
            )
        else:
            feature = feature[:target_len]
    else:
        raise NotImplementedError
    return feature
497
+
498
+
499
def align_whisper_feauture_length(
    feature, target_len, fast_mapping=True, source_hop=320, target_hop=256
):
    """Resample Whisper features from the source hop size to the target hop.

    Frames are upsampled by repetition, block-averaged down to the target
    frame rate, and truncated to ``target_len``. Whisper emits at most 1500
    frames, so ``target_len`` is clipped accordingly.
    """
    hop_gcd = np.gcd(source_hop, target_hop)
    src_hop = source_hop // hop_gcd
    tgt_hop = target_hop // hop_gcd
    # tgt_hop source frames map onto src_hop target frames.

    max_source_len = 1500
    target_len = min(target_len, max_source_len * src_hop // tgt_hop)

    width = feature.shape[-1]

    if fast_mapping:
        # Keep only the source frames needed to produce target_len outputs.
        source_len = target_len * tgt_hop // src_hop + 1
        feature = feature[:source_len]
    else:
        source_len = max_source_len

    # const ~= target_len * target_hop; made a multiple of tgt_hop for reshape.
    const = source_len * src_hop // tgt_hop * tgt_hop

    # (source_len * src_hop, width)
    upsampled = np.repeat(feature, src_hop, axis=0)
    # (const, width) -> (const/tgt_hop, tgt_hop, width) -> block means
    downsampled = np.average(upsampled[:const].reshape(-1, tgt_hop, width), axis=1)
    assert len(downsampled) >= target_len

    # (target_len, width)
    return downsampled[:target_len]
538
+
539
+
540
def align_content_feature_length(feature, target_len, source_hop=320, target_hop=256):
    """Resample content features (e.g. SSL features) to the target frame rate.

    Each source frame is repeated ``source_hop`` times (after reducing both
    hops by their gcd) and block-averaged with window ``target_hop``. A small
    length mismatch against ``target_len`` is tolerated and fixed by repeating
    the last frame / truncating.

    Args:
        feature: array of shape (source_len, dim).
        target_len: desired number of output frames.
        source_hop: hop size (in samples) of the source features.
        target_hop: hop size (in samples) of the target frames.

    Returns:
        Array of shape (target_len, dim).

    Raises:
        ValueError: if the resampled length differs from ``target_len`` by
            more than 4 frames. (Previously this printed diagnostics and
            called ``exit()``, killing the whole process — library code
            should raise instead so callers can handle the bad utterance.)
    """
    factor = np.gcd(source_hop, target_hop)
    source_hop //= factor
    target_hop //= factor

    # (source_len, dim)
    source_len, width = feature.shape

    # const ~= target_len * target_hop; a multiple of target_hop for reshape.
    const = source_len * source_hop // target_hop * target_hop

    # (source_len * source_hop, dim)
    up_sampling_feats = np.repeat(feature, source_hop, axis=0)
    # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
    down_sampling_feats = np.average(
        up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
    )

    err = abs(target_len - len(down_sampling_feats))
    if err > 4:  ## why 4 not 3?
        raise ValueError(
            f"Length mismatch: target_len={target_len}, "
            f"raw feature={feature.shape}, "
            f"up_sampling={up_sampling_feats.shape}, "
            f"down_sampling_feats={down_sampling_feats.shape}"
        )
    if len(down_sampling_feats) < target_len:
        # (1, dim) -> (err, dim): pad by repeating the last frame.
        end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
        down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)

    # (target_len, dim)
    feat = down_sampling_feats[:target_len]

    return feat
579
+
580
+
581
def remove_outlier(values):
    """Drop values outside the Tukey fences (1.5 * IQR beyond Q1/Q3)."""
    values = np.array(values)
    q1 = np.percentile(values, 25)
    q3 = np.percentile(values, 75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Strict inequalities: values exactly on a fence are discarded.
    keep = np.logical_and(values > lower_fence, values < upper_fence)
    return values[keep]
Amphion/utils/distribution.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from torch.distributions import Normal
11
+
12
+
13
def log_sum_exp(x):
    """Numerically stable log-sum-exp over the last dimension (TF ordering)."""
    last_dim = x.dim() - 1
    max_vals, _ = torch.max(x, dim=last_dim)
    max_keep, _ = torch.max(x, dim=last_dim, keepdim=True)
    # Subtract the max before exponentiating to prevent overflow.
    return max_vals + torch.log(torch.sum(torch.exp(x - max_keep), dim=last_dim))
20
+
21
+
22
def discretized_mix_logistic_loss(
    y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True
):
    """Discretized mixture of logistic distributions loss

    Note that it is assumed that input is scaled to [-1, 1].

    Args:
        y_hat (Tensor): Predicted output (B x C x T)
        y (Tensor): Target (B x T x 1).
        num_classes (int): Number of classes
        log_scale_min (float): Log scale minimum value
        reduce (bool): If True, the losses are averaged or summed for each
            minibatch.

    Returns
        Tensor: loss
    """
    assert y_hat.dim() == 3
    # C packs (mixture logits, means, log scales) for nr_mix components.
    assert y_hat.size(1) % 3 == 0
    nr_mix = y_hat.size(1) // 3

    # (B x T x C)
    y_hat = y_hat.transpose(1, 2)

    # unpack parameters. (B, T, num_mixtures) x 3
    logit_probs = y_hat[:, :, :nr_mix]
    means = y_hat[:, :, nr_mix : 2 * nr_mix]
    log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min)

    # B x T x 1 -> B x T x num_mixtures
    y = y.expand_as(means)

    centered_y = y - means
    inv_stdv = torch.exp(-log_scales)
    # Logistic CDF evaluated at the upper/lower edge of the quantization bin
    # of width 2/(num_classes-1).
    plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1))
    cdf_plus = torch.sigmoid(plus_in)
    min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1))
    cdf_min = torch.sigmoid(min_in)

    # log probability for edge case of 0 (before scaling)
    # equivalent: torch.log(torch.sigmoid(plus_in))
    log_cdf_plus = plus_in - F.softplus(plus_in)

    # log probability for edge case of 255 (before scaling)
    # equivalent: (1 - torch.sigmoid(min_in)).log()
    log_one_minus_cdf_min = -F.softplus(min_in)

    # probability for all other cases
    cdf_delta = cdf_plus - cdf_min

    mid_in = inv_stdv * centered_y
    # log probability in the center of the bin, to be used in extreme cases
    # (not actually used in our code)
    log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)

    # tf equivalent
    """
    log_probs = tf.where(x < -0.999, log_cdf_plus,
                         tf.where(x > 0.999, log_one_minus_cdf_min,
                                  tf.where(cdf_delta > 1e-5,
                                           tf.log(tf.maximum(cdf_delta, 1e-12)),
                                           log_pdf_mid - np.log(127.5))))
    """
    # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
    # for num_classes=65536 case? 1e-7? not sure..
    inner_inner_cond = (cdf_delta > 1e-5).float()

    inner_inner_out = inner_inner_cond * torch.log(
        torch.clamp(cdf_delta, min=1e-12)
    ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
    inner_cond = (y > 0.999).float()
    inner_out = (
        inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out
    )
    cond = (y < -0.999).float()
    log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out

    # Weight per-component log-likelihoods by the mixture weights.
    log_probs = log_probs + F.log_softmax(logit_probs, -1)

    if reduce:
        return -torch.sum(log_sum_exp(log_probs))
    else:
        return -log_sum_exp(log_probs).unsqueeze(-1)
106
+
107
+
108
def to_one_hot(tensor, n, fill_with=1.0):
    """Return a one-hot encoding of ``tensor`` along a new last axis of size ``n``."""
    shape = tensor.size() + (n,)
    one_hot = torch.FloatTensor(*shape).zero_()
    if tensor.is_cuda:
        one_hot = one_hot.cuda()
    # Scatter fill_with into the class positions along the new last axis.
    one_hot.scatter_(tensor.dim(), tensor.unsqueeze(-1), fill_with)
    return one_hot
115
+
116
+
117
def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0, clamp_log_scale=False):
    """
    Sample from discretized mixture of logistic distributions

    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
        clamp_log_scale (bool): If True, clamp the selected log scales at
            ``log_scale_min``.

    Returns:
        Tensor: sample in range of [-1, 1].
    """
    assert y.size(1) % 3 == 0
    nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)
    logit_probs = y[:, :, :nr_mix]

    # sample mixture indicator from softmax
    # (Gumbel-max trick: argmax of logits plus Gumbel noise.)
    temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
    temp = logit_probs.data - torch.log(-torch.log(temp))
    _, argmax = temp.max(dim=-1)

    # (B, T) -> (B, T, nr_mix)
    one_hot = to_one_hot(argmax, nr_mix)
    # select logistic parameters
    means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
    log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
    if clamp_log_scale:
        log_scales = torch.clamp(log_scales, min=log_scale_min)
    # sample from logistic & clip to interval
    # we don't actually round to the nearest 8bit value when sampling
    # (inverse-CDF sampling: logit of a uniform sample, scaled and shifted).
    u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u))

    x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0)

    return x
155
+
156
+
157
# we can easily define discretized version of the gaussian loss, however,
# use continuous version as same as the https://clarinet-demo.github.io/
def mix_gaussian_loss(y_hat, y, log_scale_min=-7.0, reduce=True):
    """Mixture of continuous gaussian distributions loss

    Note that it is assumed that input is scaled to [-1, 1].

    Args:
        y_hat (Tensor): Predicted output (B x C x T)
        y (Tensor): Target (B x T x 1).
        log_scale_min (float): Log scale minimum value
        reduce (bool): If True, the losses are averaged or summed for each
            minibatch.
    Returns
        Tensor: loss
    """
    assert y_hat.dim() == 3
    C = y_hat.size(1)
    # C == 2 means a single Gaussian (mean, log-scale) with no mixture logits.
    if C == 2:
        nr_mix = 1
    else:
        assert y_hat.size(1) % 3 == 0
        nr_mix = y_hat.size(1) // 3

    # (B x T x C)
    y_hat = y_hat.transpose(1, 2)

    # unpack parameters.
    if C == 2:
        # special case for C == 2, just for compatibility
        logit_probs = None
        means = y_hat[:, :, 0:1]
        log_scales = torch.clamp(y_hat[:, :, 1:2], min=log_scale_min)
    else:
        # (B, T, num_mixtures) x 3
        logit_probs = y_hat[:, :, :nr_mix]
        means = y_hat[:, :, nr_mix : 2 * nr_mix]
        log_scales = torch.clamp(
            y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min
        )

    # B x T x 1 -> B x T x num_mixtures
    y = y.expand_as(means)

    centered_y = y - means
    dist = Normal(loc=0.0, scale=torch.exp(log_scales))
    # do we need to add a trick to avoid log(0)?
    log_probs = dist.log_prob(centered_y)

    if nr_mix > 1:
        # Weight per-component likelihoods by the mixture weights.
        log_probs = log_probs + F.log_softmax(logit_probs, -1)

    if reduce:
        if nr_mix == 1:
            return -torch.sum(log_probs)
        else:
            return -torch.sum(log_sum_exp(log_probs))
    else:
        if nr_mix == 1:
            return -log_probs
        else:
            return -log_sum_exp(log_probs).unsqueeze(-1)
219
+
220
+
221
def sample_from_mix_gaussian(y, log_scale_min=-7.0):
    """
    Sample from (discretized) mixture of gaussian distributions
    Args:
        y (Tensor): B x C x T
        log_scale_min (float): Log scale minimum value
    Returns:
        Tensor: sample in range of [-1, 1].
    """
    C = y.size(1)
    # C == 2 encodes a single Gaussian (mean, log-scale) without mixture logits.
    if C == 2:
        nr_mix = 1
    else:
        assert y.size(1) % 3 == 0
        nr_mix = y.size(1) // 3

    # B x T x C
    y = y.transpose(1, 2)

    if C == 2:
        logit_probs = None
    else:
        logit_probs = y[:, :, :nr_mix]

    if nr_mix > 1:
        # sample mixture indicator from softmax
        # (Gumbel-max trick: argmax over logits plus Gumbel noise.)
        temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
        temp = logit_probs.data - torch.log(-torch.log(temp))
        _, argmax = temp.max(dim=-1)

        # (B, T) -> (B, T, nr_mix)
        one_hot = to_one_hot(argmax, nr_mix)

        # Select means and log scales
        means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
        log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
    else:
        if C == 2:
            means, log_scales = y[:, :, 0], y[:, :, 1]
        elif C == 3:
            means, log_scales = y[:, :, 1], y[:, :, 2]
        else:
            assert False, "shouldn't happen"

    # NOTE(review): log_scale_min is accepted but never applied here
    # (no clamping of log_scales) — confirm whether that is intended.
    scales = torch.exp(log_scales)
    dist = Normal(loc=means, scale=scales)
    x = dist.sample()

    x = torch.clamp(x, min=-1.0, max=1.0)
    return x
Amphion/utils/mel.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from librosa.filters import mel as librosa_mel_fn
8
+
9
+
10
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress magnitudes: log(max(x, clip_val) * C).

    With the default clip_val the minimum output is ln(1e-5) = -11.5129.
    """
    clipped = torch.clamp(x, min=clip_val)
    return torch.log(clipped * C)
13
+
14
+
15
def spectral_normalize_torch(magnitudes):
    """Apply dynamic-range (log) compression to a magnitude spectrogram."""
    return dynamic_range_compression_torch(magnitudes)
18
+
19
+
20
def extract_linear_features(y, cfg, center=False):
    """Compute the linear (magnitude) spectrogram of ``y``.

    Args:
        y: audio tensor, expected in [-1, 1], shape (B, T).
        cfg: preprocess config providing n_fft, hop_size, win_size.
        center: passed to torch.stft; when False the signal is reflect-padded
            here to keep frame alignment with hop_size.

    Returns:
        Magnitude spectrogram, batch dim squeezed: (n_fft//2+1, frames).
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global hann_window
    # NOTE(review): the window is rebuilt on every call; a `not in` check (as
    # intended in the mel functions) would let the cached one be reused.
    hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    # complex tensor as default, then use view_as_real for future pytorch compatibility
    spec = torch.stft(
        y,
        cfg.n_fft,
        hop_length=cfg.hop_size,
        win_length=cfg.win_size,
        window=hann_window[str(y.device)],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon for numerical stability.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
    spec = torch.squeeze(spec, 0)
    return spec
53
+
54
+
55
def mel_spectrogram_torch(y, cfg, center=False):
    """Compute a log-compressed mel spectrogram of ``y``.

    TODO: to merge this funtion with the extract_mel_features below

    Args:
        y: audio tensor, expected in [-1, 1], shape (B, T).
        cfg: preprocess config with sample_rate, n_fft, n_mel, fmin, fmax,
            hop_size, win_size.
        center: passed to torch.stft; if False the signal is reflect-padded
            here so frames stay aligned with hop_size.

    Returns:
        Mel spectrogram tensor of shape (B, n_mel, frames).
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    # Cache the mel filter bank per (fmax, device). The original check was
    # `cfg.fmax not in mel_basis`, which can never match the composite string
    # key actually stored, so the filter bank was rebuilt on every call.
    mel_key = str(cfg.fmax) + "_" + str(y.device)
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=cfg.sample_rate,
            n_fft=cfg.n_fft,
            n_mels=cfg.n_mel,
            fmin=cfg.fmin,
            fmax=cfg.fmax,
        )
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    # Cache the analysis window per device as well.
    if str(y.device) not in hann_window:
        hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.stft(
        y,
        cfg.n_fft,
        hop_length=cfg.hop_size,
        win_length=cfg.win_size,
        window=hann_window[str(y.device)],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )

    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon for numerical stability.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
105
+
106
+
107
+ mel_basis = {}
108
+ hann_window = {}
109
+
110
+
111
def extract_mel_features(
    y,
    cfg,
    center=False,
):
    """Extract mel features

    Args:
        y (tensor): audio data in tensor, expected in [-1, 1], shape (B, T)
        cfg (dict): configuration in cfg.preprocess (sample_rate, n_fft,
            n_mel, fmin, fmax, hop_size, win_size)
        center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.

    Returns:
        tensor: a tensor containing the mel feature calculated based on STFT result
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    # Cache the mel filter bank per (fmax, device). The original check was
    # `cfg.fmax not in mel_basis`, which can never match the composite string
    # key actually stored, so the filter bank was rebuilt on every call.
    mel_key = str(cfg.fmax) + "_" + str(y.device)
    if mel_key not in mel_basis:
        mel = librosa_mel_fn(
            sr=cfg.sample_rate,
            n_fft=cfg.n_fft,
            n_mels=cfg.n_mel,
            fmin=cfg.fmin,
            fmax=cfg.fmax,
        )
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    # Cache the analysis window per device as well.
    if str(y.device) not in hann_window:
        hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    # complex tensor as default, then use view_as_real for future pytorch compatibility
    spec = torch.stft(
        y,
        cfg.n_fft,
        hop_length=cfg.hop_size,
        win_length=cfg.win_size,
        window=hann_window[str(y.device)],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    spec = torch.view_as_real(spec)
    # Magnitude with a small epsilon for numerical stability.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec.squeeze(0)
171
+
172
+
173
def extract_mel_features_tts(
    y,
    cfg,
    center=False,
    taco=False,
    _stft=None,
):
    """Extract mel features

    Args:
        y (tensor): audio data in tensor
        cfg (dict): configuration in cfg.preprocess
        center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
        taco: use tacotron mel; requires ``_stft`` (an object providing
            ``mel_spectrogram``) to be supplied by the caller.

    Returns:
        tensor: a tensor containing the mel feature calculated based on STFT result
    """
    if not taco:
        if torch.min(y) < -1.0:
            print("min value is ", torch.min(y))
        if torch.max(y) > 1.0:
            print("max value is ", torch.max(y))

        global mel_basis, hann_window
        # NOTE(review): this checks the raw fmax against composite string
        # keys ("<fmax>_<device>"), so it is always True and the filter bank
        # and window are rebuilt on every call.
        if cfg.fmax not in mel_basis:
            mel = librosa_mel_fn(
                sr=cfg.sample_rate,
                n_fft=cfg.n_fft,
                n_mels=cfg.n_mel,
                fmin=cfg.fmin,
                fmax=cfg.fmax,
            )
            mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
                torch.from_numpy(mel).float().to(y.device)
            )
            hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)

        y = torch.nn.functional.pad(
            y.unsqueeze(1),
            (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
            mode="reflect",
        )
        y = y.squeeze(1)

        # complex tensor as default, then use view_as_real for future pytorch compatibility
        spec = torch.stft(
            y,
            cfg.n_fft,
            hop_length=cfg.hop_size,
            win_length=cfg.win_size,
            window=hann_window[str(y.device)],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.view_as_real(spec)
        # Magnitude with a small epsilon for numerical stability.
        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

        spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
        spec = spectral_normalize_torch(spec)
    else:
        # Tacotron-style extraction delegates to the provided STFT module;
        # the energy output is discarded here.
        audio = torch.clip(y, -1, 1)
        audio = torch.autograd.Variable(audio, requires_grad=False)
        spec, energy = _stft.mel_spectrogram(audio)

    return spec.squeeze(0)
242
+
243
+
244
def amplitude_phase_spectrum(y, cfg):
    """Compute log-amplitude, phase, real and imaginary STFT components of ``y``.

    Returns:
        (log_amplitude, phase, rea, imag), each shaped (n_fft//2+1, frames)
        for a single utterance, or with a leading batch axis otherwise.
    """
    window = torch.hann_window(cfg.win_size).to(y.device)

    pad = int((cfg.n_fft - cfg.hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    stft_spec = torch.stft(
        y,
        cfg.n_fft,
        hop_length=cfg.hop_size,
        win_length=cfg.win_size,
        window=window,
        center=False,
        return_complex=True,
    )

    stft_spec = torch.view_as_real(stft_spec)
    # Drop the batch axis for single-utterance input.
    if stft_spec.size()[0] == 1:
        stft_spec = stft_spec.squeeze(0)

    if len(list(stft_spec.size())) == 4:
        rea = stft_spec[:, :, :, 0]  # [batch_size, n_fft//2+1, frames]
        imag = stft_spec[:, :, :, 1]  # [batch_size, n_fft//2+1, frames]
    else:
        rea = stft_spec[:, :, 0]  # [n_fft//2+1, frames]
        imag = stft_spec[:, :, 1]  # [n_fft//2+1, frames]

    # Log-magnitude with an epsilon; phase via atan2.
    log_amplitude = torch.log(
        torch.abs(torch.sqrt(torch.pow(rea, 2) + torch.pow(imag, 2))) + 1e-5
    )  # [n_fft//2+1, frames]
    phase = torch.atan2(imag, rea)  # [n_fft//2+1, frames]

    return log_amplitude, phase, rea, imag
Amphion/utils/prompt_preparer.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
class PromptPreparer:
    """Mixin that builds NAR-stage embedding inputs plus their prompt prefixes.

    Expects the host class to provide: ``prefix_mode``,
    ``nar_audio_embeddings`` (one embedding layer per quantizer),
    ``num_quantizers``, ``audio_token_num``, and ``rng`` (used by
    prefix_mode 2).
    """

    def prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes):
        """Dispatch to the prefix strategy selected by ``self.prefix_mode``.

        Returns:
            (y_emb, prefix_len): summed audio embeddings and the number of
            leading positions acting as the prompt prefix.
        """
        if self.prefix_mode == 0:
            y_emb, prefix_len = self._handle_prefix_mode_0(y, codes, nar_stage)
        elif self.prefix_mode == 1:
            y_emb, prefix_len = self._handle_prefix_mode_1(y, y_lens, codes, nar_stage)
        elif self.prefix_mode in [2, 4]:
            y_emb, prefix_len = self._handle_prefix_mode_2_4(
                y, y_lens, codes, nar_stage, y_prompts_codes
            )
        else:
            raise ValueError("Invalid prefix mode")

        return y_emb, prefix_len

    def _handle_prefix_mode_0(self, y, codes, nar_stage):
        # No prompt prefix: sum the embeddings of all stages below nar_stage.
        prefix_len = 0
        y_emb = self.nar_audio_embeddings[0](y)
        for j in range(1, nar_stage):
            y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
        return y_emb, 0

    def _handle_prefix_mode_1(self, y, y_lens, codes, nar_stage):
        # Use the first 25%-50% of the shortest utterance in the batch as the
        # prompt, capped at 225 positions.
        int_low = (0.25 * y_lens.min()).type(torch.int64).item()
        prefix_len = torch.randint(int_low, int_low * 2, size=()).item()
        prefix_len = min(prefix_len, 225)

        y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
        y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
        for j in range(1, self.num_quantizers):
            # Prompts accumulate every quantizer level; the target part only
            # accumulates stages below the current NAR stage.
            y_prompts += self.nar_audio_embeddings[j](codes[:, :prefix_len, j])
            if j < nar_stage:
                y_emb += self.nar_audio_embeddings[j](codes[:, prefix_len:, j])
        y_emb = torch.concat([y_prompts, y_emb], axis=1)
        return y_emb, prefix_len

    def _handle_prefix_mode_2_4(self, y, y_lens, codes, nar_stage, y_prompts_codes):
        if self.prefix_mode == 2:
            # Sample a random window per batch item as the prompt, and mask it
            # out of the target codes with the placeholder token. NOTE: this
            # mutates the caller's ``codes`` tensor in place.
            prefix_len = min(225, int(0.25 * y_lens.min().item()))

            y_prompts_codes = []
            for b in range(codes.shape[0]):
                start = self.rng.randint(0, y_lens[b].item() - prefix_len)
                y_prompts_codes.append(
                    torch.clone(codes[b, start : start + prefix_len])
                )
                codes[b, start : start + prefix_len, nar_stage] = self.audio_token_num
            y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
        else:
            # Mode 4: prompt codes are provided by the caller.
            prefix_len = y_prompts_codes.shape[1]

        y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
        y_emb = self.nar_audio_embeddings[0](y)
        for j in range(1, self.num_quantizers):
            y_prompts += self.nar_audio_embeddings[j](y_prompts_codes[..., j])
            if j < nar_stage:
                y_emb += self.nar_audio_embeddings[j](codes[..., j])
        y_emb = torch.concat([y_prompts, y_emb], axis=1)

        return y_emb, prefix_len
__pycache__/model.cpython-310.pyc ADDED
Binary file (10.5 kB). View file
 
conf/default.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ is_left_ear: true
2
+ loss_function: MultiResolutionL1SpecLoss
3
+
4
+ trainer:
5
+ acoustic_hop_length: 256
6
+ acoustic_n_fft: 512
7
+ acoustic_sr: 16000
8
+ acoustic_win_length: 512
9
+ adam_beta1: 0.9
10
+ adam_beta2: 0.999
11
+ adam_epsilon: 1.0e-08
12
+ dataloader_drop_last: true
13
+ dataloader_num_workers: 8
14
+ dataloader_persistent_workers: false
15
+ dataloader_pin_memory: true
16
+ dataloader_prefetch_factor: 2
17
+ ddp_find_unused_parameters: true
18
+ debug: false
19
+ do_eval: false
20
+ do_predict: false
21
+ do_train: true
22
+ early_stopping_patience: 20
23
+ eval_batch_size: 1
24
+ eval_epoch_interval: 20
25
+ gradient_accumulation_steps: 1
26
+ greater_is_better: true
27
+ learning_rate: 0.001
28
+ lr_scheduler_type: constant_schedule_with_warmup
29
+ max_grad_norm: 5
30
+ max_steps: 0
31
+ metric_for_best_model: OVRL
32
+ num_train_epochs: 2000
33
+ optim: adamw
34
+ output_dir: exp/bmi__bmi__fix-ddp-sampler__warmup-2000__fixed-length__rerun
35
+ per_device_train_batch_size: 8
36
+ plot_lr: false
37
+ resume_from_checkpoint: "no"
38
+ save_epoch_interval: 20
39
+ save_total_limit: 100
40
+ seed: 20220815
41
+ warmup_ratio: 0.0
42
+ warmup_steps: 2000
43
+
44
+ predict_dataset:
45
+ enroll_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/dev/speaker_adapt
46
+ normalize: true
47
+ scenes_file_fpath: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/metadata/scenes.dev.json
48
+ scenes_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/dev/scenes/
49
+ scenes_listeners_file: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/metadata/scenes_listeners.dev.json
50
+ small_test: false
51
+
52
+ train_dataset:
53
+ enroll_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/train/targets
54
+ enroll_len_limit: 10
55
+ limit: -1
56
+ normalize: false
57
+ sample_len_limit: 4
58
+ scenes_file_fpath: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/metadata/scenes.train.json
59
+ scenes_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/train/scenes/
60
+ sr: 44100
61
+ use_all_enroll: false
62
+ use_additional_data: true
63
+
64
+ eval_dev_dataset:
65
+ scenes_listeners_file: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/metadata/scenes_listeners.dev.json
66
+ scenes_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/dev/scenes/
67
+ enroll_folder: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/dev/speaker_adapt
68
+ limit: 500
69
+ normalize: false
70
+ scenes_file_fpath: /data/xhao/clarity-ICASSP2023/clarity_CEC2_data/clarity_data/metadata/scenes.dev.json
exp/bmi__fa-codec/2024_05_20--16_21_26.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 05-20 16:21:26 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
2
+ 05-20 16:21:26 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
3
+ 05-20 16:21:26 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
4
+ 05-20 16:21:26 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
exp/bmi__fa-codec/2024_05_20--16_22_35.log ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 05-20 16:22:35 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
2
+ 05-20 16:22:35 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
3
+ 05-20 16:22:39 INFO [logging.py:61]:
4
+ Environment information:
5
+ - `Accelerate` version: 0.28.0
6
+ - Platform: Linux-6.1.0-18-amd64-x86_64-with-glibc2.36
7
+ - Python version: 3.10.13
8
+ - Numpy version: 1.26.4
9
+ - PyTorch version (GPU?): 2.2.2 (True)
10
+ - System RAM: 503.49 GB
11
+ - GPU Available: True
12
+ - GPU IDs: 8
13
+ - GPU type: NVIDIA RTX A6000
14
+ 05-20 16:22:39 INFO [logging.py:61]:
15
+ ===============================================================================================
16
+ Layer (type:depth-idx) Param #
17
+ ===============================================================================================
18
+ Model --
19
+ ├─Linear: 1-1 8,224
20
+ ├─FACodecEncoder: 1-2 --
21
+ │ └─Sequential: 2-1 --
22
+ │ │ └─Conv1d: 3-1 (288)
23
+ │ │ └─EncoderBlock: 3-2 (33,728)
24
+ │ │ └─EncoderBlock: 3-3 (165,760)
25
+ │ │ └─EncoderBlock: 3-4 (724,736)
26
+ │ │ └─EncoderBlock: 3-5 (2,891,264)
27
+ │ │ └─Activation1d: 3-6 (1,024)
28
+ │ │ └─Conv1d: 3-7 (393,728)
29
+ ├─FACodecDecoder: 1-3 --
30
+ │ └─ModuleList: 2-2 --
31
+ │ │ └─ResidualVQ: 3-8 (12,816)
32
+ │ │ └─ResidualVQ: 3-9 (25,632)
33
+ │ │ └─ResidualVQ: 3-10 (38,448)
34
+ │ └─Sequential: 2-3 --
35
+ │ │ └─Conv1d: 3-11 (1,837,056)
36
+ │ │ └─DecoderBlock: 3-12 (11,550,208)
37
+ │ │ └─DecoderBlock: 3-13 (2,891,520)
38
+ │ │ └─DecoderBlock: 3-14 (659,328)
39
+ │ │ └─DecoderBlock: 3-15 (133,056)
40
+ │ │ └─Activation1d: 3-16 (128)
41
+ │ │ └─Conv1d: 3-17 (450)
42
+ │ │ └─Tanh: 3-18 --
43
+ │ └─TransformerEncoder: 2-4 --
44
+ │ │ └─PositionalEncoding: 3-19 --
45
+ │ │ └─ModuleList: 3-20 (7,353,344)
46
+ │ │ └─LayerNorm: 3-21 (512)
47
+ │ └─Linear: 2-5 (131,584)
48
+ │ └─LayerNorm: 2-6 --
49
+ │ └─CNNLSTM: 2-7 --
50
+ │ │ └─Sequential: 3-22 (1,579,520)
51
+ │ │ └─ModuleList: 3-23 (514)
52
+ │ └─CNNLSTM: 2-8 --
53
+ │ │ └─Sequential: 3-24 (1,579,520)
54
+ │ │ └─ModuleList: 3-25 (1,285,771)
55
+ │ └─Sequential: 2-9 --
56
+ │ │ └─GradientReversal: 3-26 --
57
+ │ │ └─CNNLSTM: 3-27 (1,580,034)
58
+ │ └─Sequential: 2-10 --
59
+ │ │ └─GradientReversal: 3-28 --
60
+ │ │ └─CNNLSTM: 3-29 (2,865,291)
61
+ │ └─Sequential: 2-11 --
62
+ │ │ └─GradientReversal: 3-30 --
63
+ │ │ └─CNNLSTM: 3-31 (64,595,920)
64
+ ├─ERB: 1-4 --
65
+ │ └─Linear: 2-12 (3,728)
66
+ │ └─Linear: 2-13 (3,728)
67
+ ├─SubbandFeatureExtractor: 1-5 --
68
+ │ └─Unfold: 2-14 --
69
+ ├─Sequential: 1-6 --
70
+ │ └─GroupNorm: 2-15 36
71
+ │ └─Conv1d: 2-16 608
72
+ ├─ModuleList: 1-7 --
73
+ │ └─TriplePathRNN: 2-17 --
74
+ │ │ └─ResRNN: 3-32 54,368
75
+ │ │ └─ResRNN: 3-33 54,368
76
+ │ │ └─Linear: 3-34 2,080
77
+ │ └─TriplePathRNN: 2-18 --
78
+ │ │ └─ResRNN: 3-35 54,368
79
+ │ │ └─ResRNN: 3-36 54,368
80
+ │ │ └─Linear: 3-37 2,080
81
+ │ └─TriplePathRNN: 2-19 --
82
+ │ │ └─ResRNN: 3-38 54,368
83
+ │ │ └─ResRNN: 3-39 54,368
84
+ │ │ └─Linear: 3-40 2,080
85
+ ├─Sequential: 1-8 --
86
+ │ └─GroupNorm: 2-20 64
87
+ │ └─Conv1d: 2-21 2,112
88
+ │ └─Tanh: 2-22 --
89
+ │ └─Conv1d: 2-23 4,160
90
+ │ └─Tanh: 2-24 --
91
+ │ └─Conv1d: 2-25 260
92
+ ===============================================================================================
93
+ Total params: 102,686,548
94
+ Trainable params: 347,912
95
+ Non-trainable params: 102,338,636
96
+ ===============================================================================================
97
+ 05-20 16:22:40 INFO [logging.py:61]: warmup_steps=2000. warmup_ratio will be ignored.
98
+ 05-20 16:22:41 INFO [logging.py:61]: Will start from scratch (no checkpoint will be loaded).
99
+ 05-20 16:22:41 INFO [logging.py:61]: ***** Running training *****
100
+ 05-20 16:22:41 INFO [logging.py:61]: Num Epochs = 2,000
101
+ 05-20 16:22:41 INFO [logging.py:61]: `steps_per_epoch` = 125
102
+ 05-20 16:22:41 INFO [logging.py:61]: Instantaneous batch size per device = 16
103
+ 05-20 16:22:41 INFO [logging.py:61]: Gradient Accumulation steps = 1
104
+ 05-20 16:22:41 INFO [logging.py:61]: Total optimization steps = 250,000
105
+ 05-20 16:22:41 INFO [logging.py:61]: ========= Epoch 1 out of 2000 =========
106
+ 05-20 16:22:41 INFO [logging.py:61]: Begin training...
107
+ 05-20 16:23:34 INFO [logging.py:61]: Loss 'loss' on epoch 1: 0.28071990609169006
108
+ 05-20 16:23:34 INFO [logging.py:61]: Loss 'norm_before' on epoch 1: 0.033199019730091095
109
+ 05-20 16:23:34 INFO [logging.py:61]: ========= Epoch 2 out of 2000 =========
110
+ 05-20 16:23:34 INFO [logging.py:61]: Begin training...
exp/bmi__fa-codec/2024_05_20--16_24_01.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 05-20 16:24:01 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
2
+ 05-20 16:24:01 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
3
+ 05-20 16:24:01 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
4
+ 05-20 16:24:01 INFO [logger.py:80]: Initialized logger with log file in exp/bmi__fa-codec.
exp/bmi__fa-codec/amplified_signals/S06021_L0014_HA-output.wav ADDED
Binary file (554 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06026_L0088_HA-output.wav ADDED
Binary file (449 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06031_L0096_HA-output.wav ADDED
Binary file (478 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06036_L0036_HA-output.wav ADDED
Binary file (416 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06066_L0042_HA-output.wav ADDED
Binary file (535 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06071_L0089_HA-output.wav ADDED
Binary file (528 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06086_L0072_HA-output.wav ADDED
Binary file (442 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06091_L0099_HA-output.wav ADDED
Binary file (623 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06101_L0042_HA-output.wav ADDED
Binary file (484 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06111_L0002_HA-output.wav ADDED
Binary file (481 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06116_L0092_HA-output.wav ADDED
Binary file (643 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06126_L0069_HA-output.wav ADDED
Binary file (522 kB). View file
 
exp/bmi__fa-codec/amplified_signals/S06146_L0017_HA-output.wav ADDED
Binary file (522 kB). View file