|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| """Utility class for extracting features from the text and audio input."""
|
| from __future__ import absolute_import
|
| from __future__ import division
|
| from __future__ import print_function
|
|
|
| import codecs
|
| import numpy as np
|
|
|
|
|
def compute_spectrogram_feature(samples, sample_rate, stride_ms=10.0,
                                window_ms=20.0, max_freq=None, eps=1e-14):
  """Compute the spectrograms for the input samples(waveforms).

  More about spectrogram computation, please refer to:
  https://en.wikipedia.org/wiki/Short-time_Fourier_transform.

  Args:
    samples: a 1-D array-like of audio samples (the waveform).
    sample_rate: the sample rate of the waveform, in Hz.
    stride_ms: hop between successive analysis frames, in milliseconds.
    window_ms: length of each analysis frame, in milliseconds.
    max_freq: keep only frequency bins at or below this value, in Hz.
      Defaults to the Nyquist frequency (sample_rate / 2).
    eps: small constant added before the log to avoid log(0).

  Returns:
    A 2-D float array of shape [num_frames, num_frequency_bins] containing
    the log-power spectrogram.

  Raises:
    ValueError: if max_freq exceeds the Nyquist frequency, if stride_ms is
      greater than window_ms, or if the waveform is shorter than one window.
  """
  if max_freq is None:
    max_freq = sample_rate / 2
  if max_freq > sample_rate / 2:
    raise ValueError("max_freq must not be greater than half of sample rate.")

  if stride_ms > window_ms:
    raise ValueError("Stride size must not be greater than window size.")

  stride_size = int(0.001 * sample_rate * stride_ms)
  window_size = int(0.001 * sample_rate * window_ms)

  # Accept any array-like input; the strided-view code below needs an ndarray.
  samples = np.asarray(samples)
  if len(samples) < window_size:
    # Without this guard a too-short waveform yields a zero-column strided
    # view and the sanity assert below fails with an opaque IndexError.
    raise ValueError("Input waveform is shorter than a single window.")

  # Drop trailing samples that do not fill a complete stride, then build a
  # zero-copy view of shape [window_size, num_windows].
  truncate_size = (len(samples) - window_size) % stride_size
  samples = samples[:len(samples) - truncate_size]
  nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
  nstrides = (samples.strides[0], samples.strides[0] * stride_size)
  windows = np.lib.stride_tricks.as_strided(
      samples, shape=nshape, strides=nstrides)
  # Sanity-check the strided view; only meaningful when >= 2 windows exist
  # (indexing windows[:, 1] would otherwise raise IndexError).
  if nshape[1] > 1:
    assert np.all(
        windows[:, 1] == samples[stride_size:(stride_size + window_size)])

  # Apply a Hanning window, take the squared magnitude of the real FFT of
  # each frame, and normalize to a power spectral density.
  weighting = np.hanning(window_size)[:, None]
  fft = np.fft.rfft(windows * weighting, axis=0)
  fft = np.absolute(fft)
  fft = fft**2
  scale = np.sum(weighting**2) * sample_rate
  fft[1:-1, :] *= (2.0 / scale)
  # DC and Nyquist bins have no mirrored counterpart, so no factor of 2.
  fft[(0, -1), :] /= scale

  # Center frequency (Hz) of each FFT bin.
  freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

  # Keep bins up to max_freq, take the log, and return as [frames, bins].
  ind = np.where(freqs <= max_freq)[0][-1] + 1
  specgram = np.log(fft[:ind, :] + eps)
  return np.transpose(specgram, (1, 0))
|
|
|
|
|
class AudioFeaturizer(object):
  """Class to extract spectrogram features from the audio input."""

  def __init__(self, sample_rate=16000, window_ms=20.0, stride_ms=10.0):
    """Record the spectrogram parameters used for feature extraction.

    Args:
      sample_rate: an integer specifying the sample rate of the input waveform.
      window_ms: an integer for the length of a spectrogram frame, in ms.
      stride_ms: an integer for the frame stride, in ms.
    """
    self.sample_rate = sample_rate
    self.window_ms = window_ms
    self.stride_ms = stride_ms
|
|
|
|
|
def compute_label_feature(text, token_to_idx):
  """Convert string to a list of integers."""
  normalized = text.strip().lower()
  return [token_to_idx[char] for char in normalized]
|
|
|
|
|
class TextFeaturizer(object):
  """Extract text feature based on char-level granularity.

  By looking up the vocabulary table, each input string (one line of transcript)
  will be converted to a sequence of integer indexes.
  """

  def __init__(self, vocab_file):
    """Build token/index lookup tables from a vocabulary file.

    Args:
      vocab_file: path to a utf-8 text file containing one token per line.
        Lines starting with '#' are treated as comments and skipped.
    """
    lines = []
    with codecs.open(vocab_file, "r", "utf-8") as fin:
      lines.extend(fin.readlines())
    self.token_to_index = {}
    self.index_to_token = {}
    self.speech_labels = ""
    index = 0
    for line in lines:
      # Strip only a trailing newline. The previous `line[:-1]` chopped the
      # last character of a final line that has no newline terminator.
      line = line.rstrip("\n")
      if line.startswith("#"):
        # Skip comment line.
        continue
      self.token_to_index[line] = index
      self.index_to_token[index] = line
      self.speech_labels += line
      index += 1
|
|
|