yuekai
/

aishell1_tlg_essentials

Model card Files Files and versions

xet

Community

Yuekai Zhang committed on Apr 4, 2023

Commit

57bf40b

1 Parent(s): 55f6a13

add blank skip

Browse files

Files changed (2) hide show

test/test_frame_reducer.py +191 -0
test/test_riva_wfst_decoder.py +13 -6

test/test_frame_reducer.py ADDED Viewed

	@@ -0,0 +1,191 @@

+#!/usr/bin/env python3
+#
+# Copyright      2022  Xiaomi Corp.        (authors: Yifan   Yang,
+#                                                    Zengwei Yao,
+#                                                    Wei     Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+    """
+    Args:
+      lengths:
+        A 1-D tensor containing sentence lengths.
+      max_len:
+        The length of masks.
+    Returns:
+      Return a 2-D bool tensor, where masked positions
+      are filled with `True` and non-masked positions are
+      filled with `False`.
+    >>> lengths = torch.tensor([1, 3, 2, 5])
+    >>> make_pad_mask(lengths)
+    tensor([[False,  True,  True,  True,  True],
+            [False, False, False,  True,  True],
+            [False, False,  True,  True,  True],
+            [False, False, False, False, False]])
+    """
+    assert lengths.ndim == 1, lengths.ndim
+    max_len = max(max_len, lengths.max())
+    n = lengths.size(0)
+    seq_range = torch.arange(0, max_len, device=lengths.device)
+    expaned_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+    return expaned_lengths >= lengths.unsqueeze(-1)
+class FrameReducer(nn.Module):
+    """Drop encoder frames whose CTC blank score is high.
+
+    For every frame, the blank entry of the CTC output is compared against
+    a fixed threshold (``math.log(0.9)``); frames at or above it are removed
+    from the encoder output, and the surviving frames of each utterance are
+    packed to the front and padded to a common length.
+
+    The module has no parameters or buffers of its own.
+    """
+    def __init__(
+        self,
+    ):
+        super().__init__()
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        ctc_output: torch.Tensor,
+        y_lens: Optional[torch.Tensor] = None,
+        blank_id: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x:
+              The shared encoder output with shape [N, T, C].
+            x_lens:
+              A tensor of shape (batch_size,) containing the number of frames in
+              `x` before padding.
+            ctc_output:
+              The CTC output with shape [N, T, vocab_size]. It is compared
+              against `math.log(0.9)`, so it is presumably log-probabilities
+              (TODO confirm with callers).
+            y_lens:
+              A tensor of shape (batch_size,) containing the number of frames in
+              `y` before padding. When given, at most `T - y_lens[i]` frames
+              are removed from utterance `i`.
+            blank_id:
+              The blank id of ctc_output.
+        Returns:
+            out:
+              The frame reduced encoder output with shape [N, T', C].
+            out_lens:
+              A tensor of shape (batch_size,) containing the number of frames in
+              `out` before padding.
+        """
+        N, T, C = x.size()
+        # True at padded positions. The mask width is x_lens.max(), so the
+        # product below assumes x_lens.max() == T (TODO confirm).
+        padding_mask = make_pad_mask(x_lens)
+        # Keep a frame when its blank score is below log(0.9) and it is not padding.
+        non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask)
+        if y_lens is not None:
+            # Limit the maximum number of reduced frames
+            limit_lens = T - y_lens
+            max_limit_len = limit_lens.max().int()
+            # Per row, indices of the max_limit_len frames with the highest
+            # blank score. "fake" because rows with a smaller limit must use
+            # only a prefix of these indices.
+            fake_limit_indexes = torch.topk(
+                ctc_output[:, :, blank_id], max_limit_len
+            ).indices
+            # From here on T is rebound: it no longer holds the frame count but
+            # a [N, max_limit_len] tensor of column positions 0..max_limit_len-1.
+            T = (
+                torch.arange(max_limit_len)
+                .expand_as(
+                    fake_limit_indexes,
+                )
+                .to(device=x.device)
+            )
+            # Wrap each row's column positions into [0, limit_lens[i]) so the
+            # gather below only reads that row's first limit_lens[i] top-k
+            # entries (repeating them as needed).
+            # NOTE(review): a row with limit_lens[i] == 0 (y_lens[i] == T) makes
+            # this an integer remainder by zero - confirm callers rule it out.
+            T = torch.remainder(T, limit_lens.unsqueeze(1))
+            limit_indexes = torch.gather(fake_limit_indexes, 1, T)
+            # True exactly at the frames a row is allowed to drop.
+            limit_mask = torch.full_like(
+                non_blank_mask,
+                False,
+                device=x.device,
+            ).scatter_(1, limit_indexes, True)
+            # Force-keep every frame outside the droppable set.
+            # NOTE(review): ~limit_mask is also True at padded positions that are
+            # not in the droppable set, so padding can be re-enabled here -
+            # confirm this is intended.
+            non_blank_mask = non_blank_mask | ~limit_mask
+        # Number of frames kept per utterance.
+        out_lens = non_blank_mask.sum(dim=1)
+        max_len = out_lens.max()
+        # Filler frames each row needs to reach max_len kept frames.
+        pad_lens_list = (
+            torch.full_like(
+                out_lens,
+                max_len.item(),
+                device=x.device,
+            )
+            - out_lens
+        )
+        max_pad_len = pad_lens_list.max()
+        # Append max_pad_len frames (F.pad's default fill) at the end of the time axis.
+        out = F.pad(x, (0, 0, 0, max_pad_len))
+        # For each row, select the first pad_lens_list[i] of the appended frames.
+        valid_pad_mask = ~make_pad_mask(pad_lens_list)
+        # Kept frames plus selected filler frames: max_len True entries per row.
+        total_valid_mask = torch.concat([non_blank_mask, valid_pad_mask], dim=1)
+        # Boolean indexing flattens the selected frames row by row; reshape
+        # restores the batch dimension.
+        out = out[total_valid_mask].reshape(N, -1, C)
+        return out, out_lens
+if __name__ == "__main__":
+    import time
+    test_times = 10000
+    device = "cuda:0"
+    frame_reducer = FrameReducer()
+    # non zero case
+    x = torch.ones(15, 498, 384, dtype=torch.float32, device=device)
+    x_lens = torch.tensor([498] * 15, dtype=torch.int64, device=device)
+    y_lens = torch.tensor([150] * 15, dtype=torch.int64, device=device)
+    ctc_output = torch.log(
+        torch.randn(15, 498, 500, dtype=torch.float32, device=device),
+    )
+    avg_time = 0
+    for i in range(test_times):
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time()
+        x_fr, x_lens_fr = frame_reducer(x, x_lens, ctc_output, y_lens)
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time() - delta_time
+        avg_time += delta_time
+    print(x_fr.shape)
+    print(x_lens_fr)
+    print(avg_time / test_times)
+    # all zero case
+    x = torch.zeros(15, 498, 384, dtype=torch.float32, device=device)
+    x_lens = torch.tensor([498] * 15, dtype=torch.int64, device=device)
+    y_lens = torch.tensor([150] * 15, dtype=torch.int64, device=device)
+    ctc_output = torch.zeros(15, 498, 500, dtype=torch.float32, device=device)
+    avg_time = 0
+    for i in range(test_times):
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time()
+        x_fr, x_lens_fr = frame_reducer(x, x_lens, ctc_output, y_lens)
+        torch.cuda.synchronize(device=x.device)
+        delta_time = time.time() - delta_time
+        avg_time += delta_time
+    print(x_fr.shape)
+    print(x_lens_fr)
+    print(avg_time / test_times)

test/test_riva_wfst_decoder.py CHANGED Viewed

@@ -4,6 +4,7 @@ import torch
 import os
 from riva.asrlib.decoder.python_decoder import BatchedMappedDecoderCuda, BatchedMappedDecoderCudaConfig
 from typing import List
 def remove_duplicates_and_blank(hyp: List[int],
                                 eos: int,
@@ -28,11 +29,14 @@ class RivaWFSTDecoder:
         config.online_opts.decoder_opts.default_beam = 17.0
         config.online_opts.decoder_opts.max_active = 7000
         config.online_opts.determinize_lattice = True
-        config.online_opts.max_batch_size = 800
-        config.online_opts.num_channels = 800
         config.online_opts.frame_shift_seconds = 0.04
         config.online_opts.lattice_postprocessor_opts.lm_scale = 5.0
         config.online_opts.lattice_postprocessor_opts.word_ins_penalty = 0.0
         config.online_opts.lattice_postprocessor_opts.nbest = beam_size
@@ -87,8 +91,8 @@ if __name__ == "__main__":
     char_dict = load_word_symbols('./data/words.txt')
     beam_size = 10
-    batch_size = 50
-    counts = 1
     # ctc_log_probs [1,103,4233]
     ctc_log_probs = torch.from_numpy(data['ctc_log_probs'])
@@ -97,6 +101,9 @@ if __name__ == "__main__":
     encoder_out_lens = torch.from_numpy(data['encoder_out_lens'])   # encoder_out_lens single element 103
     encoder_out_lens = encoder_out_lens.repeat(batch_size)          # [batch_size]
     ctc_log_probs = ctc_log_probs.contiguous().cuda()
     vocab_size = ctc_log_probs.shape[2]
     riva_decoder = RivaWFSTDecoder(vocab_size, lang_dir, beam_size)
@@ -106,7 +113,7 @@ if __name__ == "__main__":
         print("ctc_log_probs.shape:", ctc_log_probs.shape)
         total_hyps = riva_decoder.decode_mbr(ctc_log_probs, encoder_out_lens)
         print('mbr', total_hyps)
-        total_hyps = riva_decoder.decode_nbest(ctc_log_probs, encoder_out_lens)
-        print('nbest', total_hyps)
     decode_end = time.perf_counter() - decode_start
     print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")

 import os
 from riva.asrlib.decoder.python_decoder import BatchedMappedDecoderCuda, BatchedMappedDecoderCudaConfig
 from typing import List
+from test_frame_reducer import FrameReducer
 def remove_duplicates_and_blank(hyp: List[int],
                                 eos: int,
         config.online_opts.decoder_opts.default_beam = 17.0
         config.online_opts.decoder_opts.max_active = 7000
         config.online_opts.determinize_lattice = True
+        config.online_opts.max_batch_size = 100
+        config.online_opts.num_channels = 200
         config.online_opts.frame_shift_seconds = 0.04
         config.online_opts.lattice_postprocessor_opts.lm_scale = 5.0
         config.online_opts.lattice_postprocessor_opts.word_ins_penalty = 0.0
+        config.online_opts.decoder_opts.blank_penalty = 0.95
+        config.online_opts.num_post_processing_worker_threads = 16
+        config.online_opts.num_decoder_copy_threads = 4
         config.online_opts.lattice_postprocessor_opts.nbest = beam_size
     char_dict = load_word_symbols('./data/words.txt')
     beam_size = 10
+    batch_size = 1
+    counts = 10
     # ctc_log_probs [1,103,4233]
     ctc_log_probs = torch.from_numpy(data['ctc_log_probs'])
     encoder_out_lens = torch.from_numpy(data['encoder_out_lens'])   # encoder_out_lens single element 103
     encoder_out_lens = encoder_out_lens.repeat(batch_size)          # [batch_size]
     ctc_log_probs = ctc_log_probs.contiguous().cuda()
+    frame_reducer = FrameReducer()
+    ctc_log_probs, encoder_out_len = frame_reducer(ctc_log_probs, encoder_out_lens.cuda(), ctc_log_probs)
     vocab_size = ctc_log_probs.shape[2]
     riva_decoder = RivaWFSTDecoder(vocab_size, lang_dir, beam_size)
         print("ctc_log_probs.shape:", ctc_log_probs.shape)
         total_hyps = riva_decoder.decode_mbr(ctc_log_probs, encoder_out_lens)
         print('mbr', total_hyps)
+        # total_hyps = riva_decoder.decode_nbest(ctc_log_probs, encoder_out_lens)
+        # print('nbest', total_hyps)
     decode_end = time.perf_counter() - decode_start
     print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")