yuekai
/

aishell1_tlg_essentials

Model card Files and versions

xet

Community

Yuekai Zhang committed on Jun 7, 2023

Commit

170cf1f

1 Parent(s): 96f87e0

add reproducible bug

Browse files

Files changed (2) hide show

test/test_frame_reducer.py +2 -2
test/test_riva_wfst_decoder.py +17 -12

test/test_frame_reducer.py CHANGED Viewed

@@ -96,7 +96,7 @@ class FrameReducer(nn.Module):
         """
         N, T, C = x.size()
-        padding_mask = make_pad_mask(x_lens)
         non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask)
         if y_lens is not None:
@@ -188,4 +188,4 @@ if __name__ == "__main__":
         avg_time += delta_time
     print(x_fr.shape)
     print(x_lens_fr)
-    print(avg_time / test_times)

         """
         N, T, C = x.size()
+        padding_mask = make_pad_mask(x_lens, x.size(1))
         non_blank_mask = (ctc_output[:, :, blank_id] < math.log(0.9)) * (~padding_mask)
         if y_lens is not None:
         avg_time += delta_time
     print(x_fr.shape)
     print(x_lens_fr)
+    print(avg_time / test_times)

test/test_riva_wfst_decoder.py CHANGED Viewed

@@ -6,6 +6,8 @@ from riva.asrlib.decoder.python_decoder import BatchedMappedDecoderCuda, Batched
 from typing import List
 from test_frame_reducer import FrameReducer
 def remove_duplicates_and_blank(hyp: List[int],
                                 eos: int,
                                 blank_id: int = 0) -> List[int]:
@@ -37,6 +39,7 @@ class RivaWFSTDecoder:
         config.online_opts.decoder_opts.blank_penalty = 0.95
         config.online_opts.num_post_processing_worker_threads = 16
         config.online_opts.num_decoder_copy_threads = 4
         #config.online_opts.decoder_opts.ntokens_pre_allocated = 10_000_000
@@ -77,6 +80,7 @@ class RivaWFSTDecoder:
         results = self.decoder.decode_mbr(logits, sequence_lengths_tensor)
         total_hyps = []
         for sent in results:
             hyp = [word[0] for word in sent]
             hyp_zh = "".join(hyp)
             total_hyps.append(hyp_zh)
@@ -94,24 +98,25 @@ def load_word_symbols(path):
 if __name__ == "__main__":
     lang_dir = "../output" # TLG.fst, words.txt
-    data = np.load('./data/input2.npz')
     word_id_to_word_str = load_word_symbols(os.path.join(lang_dir, "words.txt"))
     char_dict = load_word_symbols('./data/words.txt')
-    beam_size = 10
-    batch_size = 10
-    counts = 10
     # ctc_log_probs [1,103,4233]
     ctc_log_probs = torch.from_numpy(data['ctc_log_probs'])
     # ctc_log_probs , [batch_size,T,vocab_size ]
     ctc_log_probs = ctc_log_probs.repeat(batch_size,1,1)
-    encoder_out_lens = torch.from_numpy(data['encoder_out_lens'])   # encoder_out_lens single element 103
     encoder_out_lens = encoder_out_lens.repeat(batch_size)          # [batch_size]
     ctc_log_probs = ctc_log_probs.contiguous().cuda()
     frame_reducer = FrameReducer()
-    ctc_log_probs, encoder_out_lens = frame_reducer(ctc_log_probs, encoder_out_lens.cuda(), ctc_log_probs)
     vocab_size = ctc_log_probs.shape[2]
     riva_decoder = RivaWFSTDecoder(vocab_size, lang_dir, beam_size)
@@ -120,9 +125,9 @@ if __name__ == "__main__":
     for i in range(counts):
         print("ctc_log_probs.shape:", ctc_log_probs.shape)
         total_hyps = riva_decoder.decode_mbr(ctc_log_probs, encoder_out_lens)
-        print('mbr', total_hyps)
-        # total_hyps = riva_decoder.decode_nbest(ctc_log_probs, encoder_out_lens)
-        # print('nbest', total_hyps)
     decode_end = time.perf_counter() - decode_start
     #chunk_size = 32
     ctc_log_probs_list, is_first_chunk, is_last_chunk = [], [True] * batch_size, [True] * batch_size
@@ -131,13 +136,13 @@ if __name__ == "__main__":
         success = riva_decoder.online_decoder.try_init_corr_id(corr_id)
         assert success
     for i in range(batch_size):
-        #ctc_log_probs_list.append(ctc_log_probs[i,:chunk_size,:])
         ctc_log_probs_list.append(ctc_log_probs[i,:,:])
     channels, partial_hypotheses = \
     riva_decoder.online_decoder.decode_batch(corr_ids, ctc_log_probs_list,
                             is_first_chunk, is_last_chunk)
     for j, ph in enumerate(partial_hypotheses):
-        print(j, ph.words, ph.score, ph.ilabels)
-    print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")

 from typing import List
 from test_frame_reducer import FrameReducer
+USE_FINAL_PROBS=False
 def remove_duplicates_and_blank(hyp: List[int],
                                 eos: int,
                                 blank_id: int = 0) -> List[int]:
         config.online_opts.decoder_opts.blank_penalty = 0.95
         config.online_opts.num_post_processing_worker_threads = 16
         config.online_opts.num_decoder_copy_threads = 4
+        config.online_opts.use_final_probs = USE_FINAL_PROBS
         #config.online_opts.decoder_opts.ntokens_pre_allocated = 10_000_000
         results = self.decoder.decode_mbr(logits, sequence_lengths_tensor)
         total_hyps = []
         for sent in results:
+            #print([word for word in sent])
             hyp = [word[0] for word in sent]
             hyp_zh = "".join(hyp)
             total_hyps.append(hyp_zh)
 if __name__ == "__main__":
     lang_dir = "../output" # TLG.fst, words.txt
+    data = np.load('./data/input3.npz')
     word_id_to_word_str = load_word_symbols(os.path.join(lang_dir, "words.txt"))
     char_dict = load_word_symbols('./data/words.txt')
+    beam_size = 7
+    batch_size = 1
+    counts = 1
     # ctc_log_probs [1,103,4233]
     ctc_log_probs = torch.from_numpy(data['ctc_log_probs'])
     # ctc_log_probs , [batch_size,T,vocab_size ]
     ctc_log_probs = ctc_log_probs.repeat(batch_size,1,1)
+    encoder_out_lens = torch.from_numpy(data['encoder_out_len'])   # encoder_out_lens single element 103
+    #encoder_out_lens = torch.from_numpy(data['encoder_out_lens'])   # encoder_out_lens single element 103
     encoder_out_lens = encoder_out_lens.repeat(batch_size)          # [batch_size]
     ctc_log_probs = ctc_log_probs.contiguous().cuda()
     frame_reducer = FrameReducer()
+    #ctc_log_probs, encoder_out_lens = frame_reducer(ctc_log_probs, encoder_out_lens.cuda(), ctc_log_probs)
     vocab_size = ctc_log_probs.shape[2]
     riva_decoder = RivaWFSTDecoder(vocab_size, lang_dir, beam_size)
     for i in range(counts):
         print("ctc_log_probs.shape:", ctc_log_probs.shape)
         total_hyps = riva_decoder.decode_mbr(ctc_log_probs, encoder_out_lens)
+        print('mbr', 'use_final_probs:', USE_FINAL_PROBS, total_hyps)
+        #total_hyps = riva_decoder.decode_nbest(ctc_log_probs, encoder_out_lens)
+        #print('nbest', total_hyps)
     decode_end = time.perf_counter() - decode_start
     #chunk_size = 32
     ctc_log_probs_list, is_first_chunk, is_last_chunk = [], [True] * batch_size, [True] * batch_size
         success = riva_decoder.online_decoder.try_init_corr_id(corr_id)
         assert success
     for i in range(batch_size):
         ctc_log_probs_list.append(ctc_log_probs[i,:,:])
     channels, partial_hypotheses = \
     riva_decoder.online_decoder.decode_batch(corr_ids, ctc_log_probs_list,
                             is_first_chunk, is_last_chunk)
     for j, ph in enumerate(partial_hypotheses):
+        #print("streaming word ids", ph.words, ph.score)
+        pass
+    #print(f"Decode {ctc_log_probs.shape[0] * counts} sentences, cost {decode_end} seconds")