marcoyang commited on
Commit
0bdcf6a
·
1 Parent(s): a7df827

add real chunk-wise streaming inference code

Browse files
inference_600m_streaming_forward.py CHANGED
@@ -1,14 +1,14 @@
1
  import argparse
2
  import math
3
- from typing import Dict, List, Optional, Tuple
4
 
5
  from model import MultiKDModel
6
  from scaling import ScheduledFloat
7
  from subsampling import Conv2dSubsampling
8
  from zipformer import Zipformer2
9
 
10
- from lhotse import Fbank, FbankConfig
11
  import torchaudio
 
12
  import torch
13
  from torch import Tensor
14
  import torch.nn as nn
@@ -311,6 +311,44 @@ def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]:
311
 
312
  return state_list
313
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  def streaming_forward(
315
  features: Tensor,
316
  feature_lens: Tensor,
@@ -318,7 +356,7 @@ def streaming_forward(
318
  states: List[Tensor],
319
  chunk_size: int,
320
  left_context_len: int,
321
- ) -> Tuple[Tensor, Tensor, List[Tensor]]:
322
  """
323
  Returns encoder outputs, output lengths, and updated states.
324
  """
@@ -351,6 +389,7 @@ def streaming_forward(
351
  encoder_out,
352
  encoder_out_lens,
353
  new_encoder_states,
 
354
  ) = model.encoder.streaming_forward(
355
  x=x,
356
  x_lens=x_lens,
@@ -358,12 +397,13 @@ def streaming_forward(
358
  src_key_padding_mask=src_key_padding_mask,
359
  )
360
  encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C)
 
361
 
362
  new_states = new_encoder_states + [
363
  new_cached_embed_left_pad,
364
  new_processed_lens,
365
  ]
366
- return encoder_out, encoder_out_lens, new_states
367
 
368
  def chunk_forward(
369
  audio: torch.Tensor,
@@ -373,36 +413,47 @@ def chunk_forward(
373
  left_context_frames: int = 256,
374
  ):
375
  # Perform chunk by chunk forward for the encoder. Each chunk is conditioned on the current chunk and left context (maintained by the states)
376
- # At each step, we take a chunk of audio and forward the encoder
377
- # For the first chunk, we wait until the accumulated audio duration to reach (buffer + chunk_duration), the buffer
378
- # is necessary for the convolution subsampling module in the encoder.
 
 
 
 
 
379
  # After the first chunk, we perform normal chunk-by-chunk inference when the accumulated audio reaches chunk_duration
 
380
  # An example of Buffer=2 frames, chunk=5 frames, the latency for the first chunk is 7 frames (as we need to accumulate 7 frames
381
  # for decoding), the rest chunks have latency of 5 frames.
382
- # Each time we feed (5 + 2) frames to the encoder, and then shift 5 frames
383
  # Chunk 1: AAAAAAA
384
  # Chunk 2: AAAAAAA
385
  # Chunk 3: AAAAAAA
386
 
387
- # NOTE: params.chunk_size is the chunk_size regarding to the input of the zipformer encoder, so at fbank level, the chunk size
388
- # is 2 * params.chunk_size
389
-
390
- # fbank extractor
391
- extractor = Fbank(FbankConfig(num_mel_bins=feature_dim))
392
 
393
  device = next(model.parameters()).device
394
 
395
  chunk_size = int(chunk_size)
396
  chunk_size_samples = int(chunk_size * 2 * 160) # chunk size represented in audio samples of 16kHz sampling rate
397
  left_context_len = int(left_context_frames)
398
- pad_length = 7 + 2 * 3 # buffer required by encoder_embed module (i.e. convolution subsampling)
399
- pad_length_samples = (7 + 2 * 3) * 160
400
 
401
- # intialize states, to be maintained during chunk-wise forward
402
- initial_states = get_init_states(model=model, batch_size=1, device=device)
 
 
 
 
 
 
403
 
404
- # start forward chunk by chunk
 
 
 
405
  encoder_outs = []
 
406
  encoder_out_lens = 0
407
  states = initial_states
408
 
@@ -411,21 +462,24 @@ def chunk_forward(
411
 
412
  # the actual loop performing the chunk-wise inference of the encoder
413
  while True:
414
- # prepare the input for processing current chunk
415
- # compute fbank for the current chunk
416
- audio_chunk = audio[:, num_processed_samples: num_processed_samples + (chunk_size_samples + pad_length_samples)]
417
- features = extractor.extract(audio_chunk, sampling_rate=16000)
 
 
 
 
 
 
418
  features = features.to(device)
419
  feature_lens = features.shape[0]
420
-
421
- feature_lens = torch.tensor([feature_lens], device=device) # shape: (1)
422
- features = features.unsqueeze(0) # shape: (1,T,num_mels)
423
 
424
  # the audio chunk could be shorter than the expected length, for example in the last two chunks
425
- # pad the chunk so that the input shape is (chunk_size + buffer)
426
- tail_length = chunk_size * 2 + 7 + 2 * 3 # each prepared chunk should have this length
427
- if features.size(1) < tail_length:
428
- pad_length = tail_length - features.size(1)
429
  feature_lens += pad_length
430
  features = torch.nn.functional.pad(
431
  features,
@@ -437,7 +491,7 @@ def chunk_forward(
437
  states = stack_states([states])
438
 
439
  # forward current chunk in batch=1
440
- encoder_out, encoder_out_len, new_states = streaming_forward(
441
  features=features,
442
  feature_lens=feature_lens,
443
  model=model,
@@ -447,22 +501,26 @@ def chunk_forward(
447
  )
448
 
449
  encoder_outs.append(encoder_out)
 
450
  encoder_out_lens += encoder_out_len
451
 
452
  # update the states
453
  states = unstack_states(new_states)[0]
454
 
455
  num_chunk += 1
456
- num_processed_samples += chunk_size_samples
457
 
458
  if num_processed_samples > audio.shape[1]:
459
  print(f"Audio is exhausted.")
460
  break
461
 
462
  encoder_outs = torch.cat(encoder_outs, dim=1) # shape: (1,T,C)
463
-
464
- return encoder_outs, encoder_out_lens
465
-
 
 
 
466
 
467
 
468
  def main(args):
@@ -484,18 +542,19 @@ def main(args):
484
  audio, fs = torchaudio.load(args.audio)
485
  assert fs == 16000
486
 
487
- encoder_out, encoder_out_lens = chunk_forward(
488
  audio=audio, # shape (1, num_samples)
489
  model=model,
490
  feature_dim=128,
491
  chunk_size=args.chunk_size,
492
  left_context_frames=args.left_context_frames,
493
  )
494
-
495
 
496
  print(encoder_out)
497
  print(encoder_out.shape)
498
- # torch.save(encoder_out, "streaming_forward_encoder_out_no_k2.pt")
 
 
499
 
500
  if __name__=="__main__":
501
  parser = get_parser()
 
1
  import argparse
2
  import math
3
+ from typing import List, Tuple
4
 
5
  from model import MultiKDModel
6
  from scaling import ScheduledFloat
7
  from subsampling import Conv2dSubsampling
8
  from zipformer import Zipformer2
9
 
 
10
  import torchaudio
11
+ from torchaudio.compliance.kaldi import fbank
12
  import torch
13
  from torch import Tensor
14
  import torch.nn as nn
 
311
 
312
  return state_list
313
 
314
def compute_fbank(
    wavs: torch.Tensor, wav_lens: torch.Tensor
):
    """Compute Kaldi-compatible log-mel fbank features for a batch of waveforms.

    Args:
        wavs (torch.Tensor): the mono-channel input waveforms, shape (N, T)
        wav_lens (torch.Tensor): the valid length of each waveform in samples, shape (N)

    Returns:
        A tuple of (features, feat_len): the fbank features padded to a common
        frame count, shape (N, T', num_mel_bins), and the per-utterance frame
        counts, shape (N).
    """
    assert wavs.ndim == 2, wavs.shape

    # One fbank call per utterance; only the valid samples of each waveform
    # are fed to the extractor.
    feats = [
        fbank(
            wav[: wav_lens[i]].unsqueeze(0),
            sample_frequency=16000,  # this is fixed to 16000
            num_mel_bins=128,
            low_freq=20.0,
            snip_edges=False,
            high_freq=-400.0,  # negative: offset below the Nyquist frequency
            dither=0.0,
            energy_floor=1.0e-10,
        )
        for i, wav in enumerate(wavs)
    ]

    feat_len = torch.tensor([f.shape[0] for f in feats]).to(wavs.device)
    # Pad shorter utterances with LOG_EPS (log of a tiny energy) so the batch
    # is rectangular.
    features = torch.nn.utils.rnn.pad_sequence(
        feats, batch_first=True, padding_value=LOG_EPS
    ).to(wavs.device)
    return features, feat_len
350
+
351
+
352
  def streaming_forward(
353
  features: Tensor,
354
  feature_lens: Tensor,
 
356
  states: List[Tensor],
357
  chunk_size: int,
358
  left_context_len: int,
359
+ ) -> Tuple[Tensor, Tensor, List[Tensor], List[Tensor]]:
360
  """
361
  Returns encoder outputs, output lengths, updated states, and the intermediate (middle) layer outputs.
362
  """
 
389
  encoder_out,
390
  encoder_out_lens,
391
  new_encoder_states,
392
+ middle_outs,
393
  ) = model.encoder.streaming_forward(
394
  x=x,
395
  x_lens=x_lens,
 
397
  src_key_padding_mask=src_key_padding_mask,
398
  )
399
  encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C)
400
+ middle_outs = [m.permute(1, 0, 2) for m in middle_outs] # (T, N, C) ->(N, T, C)
401
 
402
  new_states = new_encoder_states + [
403
  new_cached_embed_left_pad,
404
  new_processed_lens,
405
  ]
406
+ return encoder_out, encoder_out_lens, new_states, middle_outs
407
 
408
  def chunk_forward(
409
  audio: torch.Tensor,
 
413
  left_context_frames: int = 256,
414
  ):
415
  # Perform chunk by chunk forward for the encoder. Each chunk is conditioned on the current chunk and left context (maintained by the states)
416
+ # At each step, we take a chunk of audio and forward the encoder.
417
+ # For the first chunk, we wait until the accumulated audio duration reaches (chunk_duration + buffer); the buffer
418
+ # is necessary for the convolution subsampling modules in the encoder to produce accurate output.
419
+
420
+ # The buffer consists of two parts:
421
+ # 1. Some trailing fbank frames, covered by the convolution kernels in the encoder_embed
422
+ # 2. Some extra tolerance frames, to make the last fbank frame precise (the tolerance fbank frames will be removed)
423
+
424
  # After the first chunk, we perform normal chunk-by-chunk inference when the accumulated audio reaches chunk_duration
425
+
426
  # An example of Buffer=2 frames, chunk=5 frames, the latency for the first chunk is 7 frames (as we need to accumulate 7 frames
427
  # for decoding), the rest chunks have latency of 5 frames.
428
+ # Each time we feed (5 + 2) frames to the encoder, and then shift 5 frames
429
  # Chunk 1: AAAAAAA
430
  # Chunk 2: AAAAAAA
431
  # Chunk 3: AAAAAAA
432
 
433
+ # NOTE: chunk_size is the chunk_size regarding to the input of the zipformer encoder, so at fbank level, the chunk size
434
+ # is 2 * chunk_size
 
 
 
435
 
436
  device = next(model.parameters()).device
437
 
438
  chunk_size = int(chunk_size)
439
  chunk_size_samples = int(chunk_size * 2 * 160) # chunk size represented in audio samples of 16kHz sampling rate
440
  left_context_len = int(left_context_frames)
 
 
441
 
442
+ # Buffer-related
443
+ # 1. extra frames required by encoder_embed module (i.e. convolution subsampling)
444
+ pad_length = 7 + 2 * 3 #
445
+ pad_length_samples = (7 + 2 * 3) * 160 # in samples
446
+
447
+ extra_tolerance = 0.01 # 10 ms
448
+ extra_tolerance_samples = int(extra_tolerance * 16000)
449
+ buffer_samples = pad_length_samples + extra_tolerance_samples
450
 
451
+ chunk_size_with_pad = chunk_size * 2 + 7 + 2 * 3 # This is the total number of fbank frames we need to compute for each chunk forward
452
+
453
+ # initializations, to be maintained during chunk-wise forward
454
+ initial_states = get_init_states(model=model, batch_size=1, device=device)
455
  encoder_outs = []
456
+ middle_outs = []
457
  encoder_out_lens = 0
458
  states = initial_states
459
 
 
462
 
463
  # the actual loop performing the chunk-wise inference of the encoder
464
  while True:
465
+ # Get the audio samples
466
+ audio_chunk = audio[
467
+ :,
468
+ num_processed_samples: num_processed_samples + (chunk_size_samples + buffer_samples)
469
+ ] # (1, num_samples)
470
+
471
+ # compute the fbank features for the current chunk
472
+ features, _ = compute_fbank(audio_chunk, torch.tensor([audio_chunk.shape[-1]])) # shape: (T, num_mels)
473
+
474
+ features = features[:, :chunk_size_with_pad, :] # only keep the required fbank frames for current chunk
475
  features = features.to(device)
476
  feature_lens = features.shape[0]
477
+ feature_lens = torch.tensor([features.shape[1]], device=device) # shape: (1)
 
 
478
 
479
  # the audio chunk could be shorter than the expected length, for example in the last two chunks
480
+ # so we need to pad the chunk to the expected length
481
+ if features.size(1) < chunk_size_with_pad:
482
+ pad_length = chunk_size_with_pad - features.size(1)
 
483
  feature_lens += pad_length
484
  features = torch.nn.functional.pad(
485
  features,
 
491
  states = stack_states([states])
492
 
493
  # forward current chunk in batch=1
494
+ encoder_out, encoder_out_len, new_states, middle_out = streaming_forward(
495
  features=features,
496
  feature_lens=feature_lens,
497
  model=model,
 
501
  )
502
 
503
  encoder_outs.append(encoder_out)
504
+ middle_outs.append(middle_out)
505
  encoder_out_lens += encoder_out_len
506
 
507
  # update the states
508
  states = unstack_states(new_states)[0]
509
 
510
  num_chunk += 1
511
+ num_processed_samples += chunk_size_samples # move one chunk forward
512
 
513
  if num_processed_samples > audio.shape[1]:
514
  print(f"Audio is exhausted.")
515
  break
516
 
517
  encoder_outs = torch.cat(encoder_outs, dim=1) # shape: (1,T,C)
518
+ layerwise_outs = []
519
+ for i in range(len(middle_outs[0])): # for each intermediate layer
520
+ layerwise_outs.append(torch.cat([m[i] for m in middle_outs], dim=1)) # shape: (1,T,C)
521
+
522
+ return encoder_outs, encoder_out_lens, layerwise_outs
523
+
524
 
525
 
526
  def main(args):
 
542
  audio, fs = torchaudio.load(args.audio)
543
  assert fs == 16000
544
 
545
+ encoder_out, encoder_out_lens, intermediate_hidden_states = chunk_forward(
546
  audio=audio, # shape (1, num_samples)
547
  model=model,
548
  feature_dim=128,
549
  chunk_size=args.chunk_size,
550
  left_context_frames=args.left_context_frames,
551
  )
 
552
 
553
  print(encoder_out)
554
  print(encoder_out.shape)
555
+ print(intermediate_hidden_states[-1])
556
+ print(intermediate_hidden_states[-1].shape)
557
+
558
 
559
  if __name__=="__main__":
560
  parser = get_parser()
inference_600m_streaming_forward.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Run chunk-wise streaming inference with the 600M Zipformer encoder.

# Fail fast: abort on any command error, on use of an unset variable,
# and propagate failures through pipelines.
set -euo pipefail

model_version=600m_uniform_out_ds1
causal=1                  # use the causal (streaming) model variant
left_context_frames=128   # left context, in encoder frames
chunk_size=8              # chunk size, in encoder frames

python inference_600m_streaming_forward.py \
  --model-version "$model_version" \
  --ckpt-path v0.2/iter-500000-avg-4.pt \
  --causal "$causal" \
  --left-context-frames "$left_context_frames" \
  --chunk-size "$chunk_size" \
  --audio 1284-1180-0027.flac
zipformer.py CHANGED
@@ -434,6 +434,7 @@ class Zipformer2(nn.Module):
434
  x_lens: Tensor,
435
  states: List[Tensor],
436
  src_key_padding_mask: Tensor,
 
437
  ) -> Tuple[Tensor, Tensor, List[Tensor]]:
438
  """
439
  Args:
@@ -456,6 +457,7 @@ class Zipformer2(nn.Module):
456
  - updated states
457
  """
458
  outputs = []
 
459
  new_states = []
460
  layer_offset = 0
461
 
@@ -464,14 +466,16 @@ class Zipformer2(nn.Module):
464
  ds = self.downsampling_factor[i]
465
  x = convert_num_channels(x, self.encoder_dim[i])
466
 
467
- x, new_layer_states = module.streaming_forward(
468
  x,
469
  states=states[layer_offset * 6 : (layer_offset + num_layers) * 6],
470
  left_context_len=self.left_context_frames[0] // ds,
471
  src_key_padding_mask=src_key_padding_mask[..., ::ds],
 
472
  )
473
  layer_offset += num_layers
474
  outputs.append(x)
 
475
  new_states += new_layer_states
476
 
477
  # if the last output has the largest dimension, x will be unchanged,
@@ -479,17 +483,20 @@ class Zipformer2(nn.Module):
479
  # from different pieces of 'outputs', taking each dimension from the
480
  # most recent output that has it present.
481
  x = self._get_full_dim_output(outputs)
482
- x = self.downsample_output(x)
483
- # class Downsample has this rounding behavior..
484
- assert self.output_downsampling_factor == 2
485
- if torch.jit.is_scripting() or torch.jit.is_tracing():
486
- lengths = (x_lens + 1) // 2
487
- else:
488
- with warnings.catch_warnings():
489
- warnings.simplefilter("ignore")
490
  lengths = (x_lens + 1) // 2
 
 
 
 
 
 
491
 
492
- return x, lengths, new_states
493
 
494
  @torch.jit.export
495
  def get_init_states(
 
434
  x_lens: Tensor,
435
  states: List[Tensor],
436
  src_key_padding_mask: Tensor,
437
+ return_middle_out: bool = True,
438
  ) -> Tuple[Tensor, Tensor, List[Tensor]]:
439
  """
440
  Args:
 
457
  - updated states
458
  """
459
  outputs = []
460
+ middle_outputs = []
461
  new_states = []
462
  layer_offset = 0
463
 
 
466
  ds = self.downsampling_factor[i]
467
  x = convert_num_channels(x, self.encoder_dim[i])
468
 
469
+ x, new_layer_states, cur_middle_out = module.streaming_forward(
470
  x,
471
  states=states[layer_offset * 6 : (layer_offset + num_layers) * 6],
472
  left_context_len=self.left_context_frames[0] // ds,
473
  src_key_padding_mask=src_key_padding_mask[..., ::ds],
474
+ return_middle_out=return_middle_out,
475
  )
476
  layer_offset += num_layers
477
  outputs.append(x)
478
+ middle_outputs += cur_middle_out
479
  new_states += new_layer_states
480
 
481
  # if the last output has the largest dimension, x will be unchanged,
 
483
  # from different pieces of 'outputs', taking each dimension from the
484
  # most recent output that has it present.
485
  x = self._get_full_dim_output(outputs)
486
+ if self.output_downsampling_factor >= 2:
487
+ x = self.downsample_output(x)
488
+ # class Downsample has this rounding behavior..
489
+ assert self.output_downsampling_factor == 2
490
+ if torch.jit.is_scripting() or torch.jit.is_tracing():
 
 
 
491
  lengths = (x_lens + 1) // 2
492
+ else:
493
+ with warnings.catch_warnings():
494
+ warnings.simplefilter("ignore")
495
+ lengths = (x_lens + 1) // 2
496
+ else:
497
+ lengths = x_lens
498
 
499
+ return x, lengths, new_states, middle_outputs
500
 
501
  @torch.jit.export
502
  def get_init_states(