| import numpy as np |
| from fairseq.data.audio.feature_transforms import ( |
| AudioFeatureTransform, |
| register_audio_feature_transform, |
| ) |
|
|
|
|
@register_audio_feature_transform("utterance_cmvn")
class UtteranceCMVN(AudioFeatureTransform):
    """Utterance-level CMVN (cepstral mean and variance normalization).

    Statistics are computed along axis 0 of each input independently, so
    every call normalizes one utterance using only its own values.

    Attributes:
        norm_means: If truthy, subtract the axis-0 mean from the input.
        norm_vars: If truthy, divide the input by the axis-0 standard
            deviation (floored to avoid division by zero).
    """

    @classmethod
    def from_config_dict(cls, config=None):
        """Build the transform from an optional config mapping.

        Args:
            config: Mapping that may contain the keys ``"norm_means"`` and
                ``"norm_vars"``; missing keys default to ``True``. ``None``
                is treated as an empty mapping.

        Returns:
            A new instance of ``cls``.
        """
        _config = {} if config is None else config
        # Instantiate via ``cls`` so subclasses get an instance of themselves
        # rather than of the base class.
        return cls(
            _config.get("norm_means", True),
            _config.get("norm_vars", True),
        )

    def __init__(self, norm_means: bool = True, norm_vars: bool = True):
        """Store which of the two normalization steps are enabled."""
        self.norm_means = norm_means
        self.norm_vars = norm_vars

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}"
            f"(norm_means={self.norm_means}, norm_vars={self.norm_vars})"
        )

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Normalize ``x`` along axis 0.

        Args:
            x: Array of features; presumably shaped (frames, feature_dim)
                and non-empty along axis 0 -- TODO confirm against callers.

        Returns:
            The normalized array. The input is not modified in place; when
            both normalization steps are disabled ``x`` itself is returned.
        """
        if not (self.norm_means or self.norm_vars):
            # Nothing to do: skip the statistics passes entirely.
            return x

        mean = x.mean(axis=0)

        std = None
        if self.norm_vars:
            # Variance as E[x^2] - E[x]^2. It must be taken from the raw
            # (un-centered) input, so compute it before the mean is removed.
            var = (x ** 2).sum(axis=0) / x.shape[0] - mean ** 2
            # Rounding can push this difference to zero or slightly below;
            # the floor keeps the sqrt real and the division finite.
            std = np.sqrt(np.maximum(var, 1e-10))

        if self.norm_means:
            x = np.subtract(x, mean)
        if std is not None:
            x = np.divide(x, std)

        return x
|
|