Add test wav file and jit script with LFS support

Browse files

Files changed (3) hide show

.gitattributes +1 -0
jit_pretrained_streaming.py +271 -0
test_waves/sample_1.wav +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text

jit_pretrained_streaming.py ADDED Viewed

	@@ -0,0 +1,271 @@

+#!/usr/bin/env python3
+# flake8: noqa
+# Copyright      2022-2023  Xiaomi Corp.        (authors: Fangjun Kuang, Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads torchscript models exported by `torch.jit.script()`
+and uses them to decode waves.
+You can use the following command to get the exported models:
+./zipformer/export.py \
+  --exp-dir ./zipformer/exp \
+  --causal 1 \
+  --chunk-size 16 \
+  --left-context-frames 128 \
+  --tokens data/lang_bpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+Usage of this script:
+./zipformer/jit_pretrained_streaming.py \
+  --nn-model-filename ./zipformer/exp-causal/jit_script_chunk_16_left_128.pt \
+  --tokens ./data/lang_bpe_500/tokens.txt \
+  /path/to/foo.wav \
+"""
+import argparse
+import logging
+from typing import List, Optional
+import k2
+import torch
+import torchaudio
+from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--nn-model-filename",
+        type=str,
+        required=True,
+        help="Path to the torchscript model jit_script.pt",
+    )
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        help="""Path to tokens.txt.""",
+    )
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="The sample rate of the input sound file",
+    )
+    parser.add_argument(
+        "sound_file",
+        type=str,
+        help="The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz.",
+    )
+    return parser
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float
+) -> List[torch.Tensor]:
+    """Read a list of sound files into a list 1-D float32 torch tensors.
+    Args:
+      filenames:
+        A list of sound filenames.
+      expected_sample_rate:
+        The expected sample rate of the sound files.
+    Returns:
+      Return a list of 1-D float32 torch tensors.
+    """
+    ans = []
+    for f in filenames:
+        wave, sample_rate = torchaudio.load(f)
+        assert (
+            sample_rate == expected_sample_rate
+        ), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
+        # We use only the first channel
+        ans.append(wave[0])
+    return ans
+def greedy_search(
+    decoder: torch.jit.ScriptModule,
+    joiner: torch.jit.ScriptModule,
+    encoder_out: torch.Tensor,
+    decoder_out: Optional[torch.Tensor] = None,
+    hyp: Optional[List[int]] = None,
+    device: torch.device = torch.device("cpu"),
+):
+    assert encoder_out.ndim == 2
+    context_size = decoder.context_size
+    blank_id = decoder.blank_id
+    if decoder_out is None:
+        assert hyp is None, hyp
+        hyp = [blank_id] * context_size
+        decoder_input = torch.tensor(hyp, dtype=torch.int32, device=device).unsqueeze(0)
+        # decoder_input.shape (1,, 1 context_size)
+        decoder_out = decoder(decoder_input, torch.tensor([False])).squeeze(1)
+    else:
+        assert decoder_out.ndim == 2
+        assert hyp is not None, hyp
+    T = encoder_out.size(0)
+    for i in range(T):
+        cur_encoder_out = encoder_out[i : i + 1]
+        joiner_out = joiner(cur_encoder_out, decoder_out).squeeze(0)
+        y = joiner_out.argmax(dim=0).item()
+        if y != blank_id:
+            hyp.append(y)
+            decoder_input = hyp[-context_size:]
+            decoder_input = torch.tensor(
+                decoder_input, dtype=torch.int32, device=device
+            ).unsqueeze(0)
+            decoder_out = decoder(decoder_input, torch.tensor([False])).squeeze(1)
+    return hyp, decoder_out
+def create_streaming_feature_extractor(sample_rate) -> OnlineFeature:
+    """Create a CPU streaming feature extractor.
+    At present, we assume it returns a fbank feature extractor with
+    fixed options. In the future, we will support passing in the options
+    from outside.
+    Returns:
+      Return a CPU streaming feature extractor.
+    """
+    opts = FbankOptions()
+    opts.device = "cpu"
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = sample_rate
+    opts.mel_opts.num_bins = 80
+    opts.mel_opts.high_freq = -400
+    return OnlineFbank(opts)
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    args = parser.parse_args()
+    logging.info(vars(args))
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    logging.info(f"device: {device}")
+    model = torch.jit.load(args.nn_model_filename)
+    model.eval()
+    model.to(device)
+    encoder = model.encoder
+    decoder = model.decoder
+    joiner = model.joiner
+    token_table = k2.SymbolTable.from_file(args.tokens)
+    context_size = decoder.context_size
+    logging.info("Constructing Fbank computer")
+    online_fbank = create_streaming_feature_extractor(args.sample_rate)
+    logging.info(f"Reading sound files: {args.sound_file}")
+    wave_samples = read_sound_files(
+        filenames=[args.sound_file],
+        expected_sample_rate=args.sample_rate,
+    )[0]
+    logging.info(wave_samples.shape)
+    logging.info("Decoding started")
+    chunk_length = encoder.chunk_size * 2
+    T = chunk_length + encoder.pad_length
+    logging.info(f"chunk_length: {chunk_length}")
+    logging.info(f"T: {T}")
+    states = encoder.get_init_states(device=device)
+    tail_padding = torch.zeros(int(0.3 * args.sample_rate), dtype=torch.float32)
+    wave_samples = torch.cat([wave_samples, tail_padding])
+    chunk = int(0.25 * args.sample_rate)  # 0.2 second
+    num_processed_frames = 0
+    hyp = None
+    decoder_out = None
+    start = 0
+    while start < wave_samples.numel():
+        logging.info(f"{start}/{wave_samples.numel()}")
+        end = min(start + chunk, wave_samples.numel())
+        samples = wave_samples[start:end]
+        start += chunk
+        online_fbank.accept_waveform(
+            sampling_rate=args.sample_rate,
+            waveform=samples,
+        )
+        while online_fbank.num_frames_ready - num_processed_frames >= T:
+            frames = []
+            for i in range(T):
+                frames.append(online_fbank.get_frame(num_processed_frames + i))
+            frames = torch.cat(frames, dim=0).to(device).unsqueeze(0)
+            x_lens = torch.tensor([T], dtype=torch.int32, device=device)
+            encoder_out, out_lens, states = encoder(
+                features=frames,
+                feature_lengths=x_lens,
+                states=states,
+            )
+            num_processed_frames += chunk_length
+            hyp, decoder_out = greedy_search(
+                decoder, joiner, encoder_out.squeeze(0), decoder_out, hyp, device=device
+            )
+    text = ""
+    for i in hyp[context_size:]:
+        text += token_table[i]
+    text = text.replace("▁", " ").strip()
+    logging.info(args.sound_file)
+    logging.info(text)
+    logging.info("Decoding Done")
+torch.set_num_threads(4)
+torch.set_num_interop_threads(1)
+torch._C._jit_set_profiling_executor(False)
+torch._C._jit_set_profiling_mode(False)
+torch._C._set_graph_executor_optimize(False)
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()

test_waves/sample_1.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fbc9032780bc73e4a396f21806138a44f520fcfe07cbcae4f13ca48f44d0198
+size 229420