Yuekai Zhang
commited on
Commit
·
55f6a13
1
Parent(s):
0c5ca4b
update nbest
Browse files- test/data/input2.npz +3 -0
- test/test_riva_wfst_decoder.py +61 -20
test/data/input2.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7831ad64f1fb28bf1adb5e197d43b00aebb430ac171c4a08db891321220fd90
|
| 3 |
+
size 2083188
|
test/test_riva_wfst_decoder.py
CHANGED
|
@@ -3,48 +3,88 @@ import time
|
|
| 3 |
import torch
|
| 4 |
import os
|
| 5 |
from riva.asrlib.decoder.python_decoder import BatchedMappedDecoderCuda, BatchedMappedDecoderCudaConfig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
class RivaWFSTDecoder:
|
| 8 |
-
def __init__(self, vocab_size, tlg_dir, beam_size=8
|
| 9 |
config = BatchedMappedDecoderCudaConfig()
|
| 10 |
-
config.online_opts.
|
|
|
|
|
|
|
| 11 |
config.n_input_per_chunk = 50
|
| 12 |
config.online_opts.decoder_opts.default_beam = 17.0
|
| 13 |
-
config.online_opts.decoder_opts.lattice_beam = beam_size
|
| 14 |
config.online_opts.decoder_opts.max_active = 7000
|
| 15 |
config.online_opts.determinize_lattice = True
|
| 16 |
config.online_opts.max_batch_size = 800
|
| 17 |
-
|
| 18 |
config.online_opts.num_channels = 800
|
| 19 |
config.online_opts.frame_shift_seconds = 0.04
|
| 20 |
-
|
| 21 |
config.online_opts.lattice_postprocessor_opts.lm_scale = 5.0
|
| 22 |
config.online_opts.lattice_postprocessor_opts.word_ins_penalty = 0.0
|
| 23 |
|
|
|
|
|
|
|
| 24 |
self.decoder = BatchedMappedDecoderCuda(
|
| 25 |
-
config, os.path.join(tlg_dir, "TLG.fst"),
|
|
|
|
| 26 |
)
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
def
|
| 29 |
-
|
| 30 |
sequence_lengths_tensor = length.to(torch.long).to('cpu').contiguous()
|
| 31 |
-
results = self.decoder.
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
def
|
| 35 |
-
|
|
|
|
|
|
|
| 36 |
total_hyps = []
|
| 37 |
for sent in results:
|
| 38 |
hyp = [word[0] for word in sent]
|
| 39 |
hyp_zh = "".join(hyp)
|
| 40 |
-
|
| 41 |
-
total_hyps.append(nbest_list)
|
| 42 |
return total_hyps
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
if __name__ == "__main__":
|
| 46 |
-
lang_dir = "../
|
| 47 |
-
data = np.load('./data/
|
|
|
|
|
|
|
| 48 |
|
| 49 |
beam_size = 10
|
| 50 |
batch_size = 50
|
|
@@ -64,8 +104,9 @@ if __name__ == "__main__":
|
|
| 64 |
decode_start = time.perf_counter()
|
| 65 |
for i in range(counts):
|
| 66 |
print("ctc_log_probs.shape:", ctc_log_probs.shape)
|
| 67 |
-
|
| 68 |
-
total_hyps
|
| 69 |
-
|
|
|
|
| 70 |
decode_end = time.perf_counter() - decode_start
|
| 71 |
print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")
|
|
|
|
| 3 |
import torch
|
| 4 |
import os
|
| 5 |
from riva.asrlib.decoder.python_decoder import BatchedMappedDecoderCuda, BatchedMappedDecoderCudaConfig
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
def remove_duplicates_and_blank(hyp: List[int],
                                eos: int,
                                blank_id: int = 0) -> List[int]:
    """Post-process a raw CTC label sequence.

    Collapses each run of repeated labels to a single occurrence and drops
    every blank and eos label, per standard CTC decoding.

    Args:
        hyp: raw label-id sequence from the decoder.
        eos: end-of-sequence label id to discard.
        blank_id: CTC blank label id to discard (default 0).

    Returns:
        The collapsed, filtered label sequence.
    """
    cleaned: List[int] = []
    previous = None  # last raw label seen, used to collapse repeats
    for label in hyp:
        if label != previous and label != blank_id and label != eos:
            cleaned.append(label)
        previous = label
    return cleaned
|
| 20 |
|
| 21 |
class RivaWFSTDecoder:
    """Batched CUDA WFST decoder over CTC log-probabilities.

    Wraps Riva's ``BatchedMappedDecoderCuda`` built from a ``TLG.fst`` /
    ``words.txt`` pair and exposes MBR decoding (one word string per
    utterance) and n-best decoding (token-id hypotheses per utterance).
    """

    def __init__(self, vocab_size, tlg_dir, config_dict=None, beam_size=8):
        """Build the CUDA decoder.

        Args:
            vocab_size: size of the CTC output vocabulary; the last id
                (``vocab_size - 1``) is treated as eos when post-processing
                n-best hypotheses.
            tlg_dir: directory containing ``TLG.fst`` and ``words.txt``.
            config_dict: accepted for interface compatibility but currently
                unused -- every decoder option below is hard-coded.
                NOTE(review): either apply these overrides or drop the
                parameter.
            beam_size: used both as the decoder lattice beam and as the
                number of n-best hypotheses requested.
        """
        config = BatchedMappedDecoderCudaConfig()
        config.online_opts.decoder_opts.lattice_beam = beam_size
        config.online_opts.lattice_postprocessor_opts.acoustic_scale = 10.0  # noqa
        config.n_input_per_chunk = 50
        config.online_opts.decoder_opts.default_beam = 17.0
        config.online_opts.decoder_opts.max_active = 7000
        config.online_opts.determinize_lattice = True
        config.online_opts.max_batch_size = 800
        config.online_opts.num_channels = 800
        config.online_opts.frame_shift_seconds = 0.04
        config.online_opts.lattice_postprocessor_opts.lm_scale = 5.0
        config.online_opts.lattice_postprocessor_opts.word_ins_penalty = 0.0
        config.online_opts.lattice_postprocessor_opts.nbest = beam_size

        self.decoder = BatchedMappedDecoderCuda(
            config, os.path.join(tlg_dir, "TLG.fst"),
            os.path.join(tlg_dir, "words.txt"), vocab_size
        )
        self.word_id_to_word_str = load_word_symbols(os.path.join(tlg_dir, "words.txt"))
        self.nbest = beam_size
        self.vocab_size = vocab_size

    def _prepare_inputs(self, logits, length):
        """Cast inputs to what the decoder expects: contiguous float32
        activations and contiguous int64 sequence lengths on the CPU."""
        logits = logits.to(torch.float32).contiguous()
        sequence_lengths_tensor = length.to(torch.long).to('cpu').contiguous()
        return logits, sequence_lengths_tensor

    def decode_nbest(self, logits, length):
        """Decode a batch and return n-best token-id hypotheses.

        Args:
            logits: batched CTC log-probabilities (torch tensor).
            length: per-utterance frame counts (torch tensor).

        Returns:
            A list (per utterance) of lists (per hypothesis) of label ids,
            with CTC repeats, blanks and the eos label removed.
        """
        logits, sequence_lengths_tensor = self._prepare_inputs(logits, length)
        results = self.decoder.decode_nbest(logits, sequence_lengths_tensor)
        total_hyps = []
        for nbest_sentences in results:
            nbest_list = []
            for sent in nbest_sentences:
                # subtract 1 to get the label id, since fst decoder adds 1 to the label id
                hyp_ids = [label - 1 for label in sent.ilabels]
                new_hyp = remove_duplicates_and_blank(hyp_ids, eos=self.vocab_size - 1, blank_id=0)
                nbest_list.append(new_hyp)
            total_hyps.append(nbest_list)
        return total_hyps

    def decode_mbr(self, logits, length):
        """Decode a batch with MBR and return one string per utterance.

        Words are joined without separators (Chinese-style output, per the
        ``hyp_zh`` naming).
        """
        logits, sequence_lengths_tensor = self._prepare_inputs(logits, length)
        results = self.decoder.decode_mbr(logits, sequence_lengths_tensor)
        total_hyps = []
        for sent in results:
            # Each item of `sent` is a (word, ...) tuple; keep the word only.
            hyp = [word[0] for word in sent]
            hyp_zh = "".join(hyp)
            total_hyps.append(hyp_zh)
        return total_hyps
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def load_word_symbols(path):
    """Load a Kaldi/OpenFst-style symbol table mapping id -> word string.

    Each non-empty line of *path* must be ``<word> <integer-id>``.

    Args:
        path: path to a ``words.txt`` symbol table.

    Returns:
        dict mapping ``int`` symbol id to its word string.

    Raises:
        ValueError: if a non-empty line does not have exactly two fields
            or its id field is not an integer.
    """
    word_id_to_word_str = {}
    # Explicit UTF-8: symbol tables here contain CJK characters, and the
    # locale default encoding is not reliable across platforms.
    with open(path, "rt", encoding="utf-8") as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                continue  # tolerate blank/trailing lines
            word_str, word_id = fields
            word_id_to_word_str[int(word_id)] = word_str
    return word_id_to_word_str
|
| 82 |
|
| 83 |
if __name__ == "__main__":
    # Directory holding the decoding graph and its word symbol table.
    lang_dir = "../output" # TLG.fst, words.txt
    # Pre-computed CTC log-probabilities saved as an .npz fixture
    # (test/data/input2.npz, tracked via git-lfs).
    data = np.load('./data/input2.npz')
    # Word-level symbols for the decoding-graph output labels.
    word_id_to_word_str = load_word_symbols(os.path.join(lang_dir, "words.txt"))
    # Second symbol table loaded from the test data dir -- presumably the
    # CTC modeling units (characters); TODO confirm against the model.
    char_dict = load_word_symbols('./data/words.txt')

    beam_size = 10
    batch_size = 50
|
|
|
|
| 104 |
    # Time `counts` repetitions of decoding the same batch; `counts`,
    # `riva_decoder`, `ctc_log_probs` and `encoder_out_lens` are set up
    # earlier in this script (outside this hunk).
    decode_start = time.perf_counter()
    for i in range(counts):
        print("ctc_log_probs.shape:", ctc_log_probs.shape)
        # MBR path: one joined word string per utterance.
        total_hyps = riva_decoder.decode_mbr(ctc_log_probs, encoder_out_lens)
        print('mbr', total_hyps)
        # N-best path: per utterance, a list of token-id hypotheses.
        total_hyps = riva_decoder.decode_nbest(ctc_log_probs, encoder_out_lens)
        print('nbest', total_hyps)
    decode_end = time.perf_counter() - decode_start
    print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")
|