Commit · 8948419
Parent(s): 8c4ce15
add fireredasr correctly
- fireredasr +0 -1
- fireredasr/LICENSE +201 -0
- fireredasr/README.md +160 -0
- fireredasr/examples/fireredasr +1 -0
- fireredasr/examples/inference_fireredasr_aed.sh +33 -0
- fireredasr/examples/inference_fireredasr_llm.sh +32 -0
- fireredasr/examples/pretrained_models +1 -0
- fireredasr/examples/wav/IT0011W0001.wav +0 -0
- fireredasr/examples/wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav +0 -0
- fireredasr/examples/wav/text +4 -0
- fireredasr/examples/wav/wav.scp +4 -0
- fireredasr/fireredasr/.speech2text.py.swp +0 -0
- fireredasr/fireredasr/data/asr_feat.py +107 -0
- fireredasr/fireredasr/data/token_dict.py +59 -0
- fireredasr/fireredasr/models/.fireredasr.py.swp +0 -0
- fireredasr/fireredasr/models/fireredasr.py +125 -0
- fireredasr/fireredasr/models/fireredasr_aed.py +35 -0
- fireredasr/fireredasr/models/fireredasr_llm.py +272 -0
- fireredasr/fireredasr/models/module/adapter.py +30 -0
- fireredasr/fireredasr/models/module/conformer_encoder.py +322 -0
- fireredasr/fireredasr/models/module/transformer_decoder.py +299 -0
- fireredasr/fireredasr/speech2text.py +105 -0
- fireredasr/fireredasr/tokenizer/aed_tokenizer.py +67 -0
- fireredasr/fireredasr/tokenizer/llm_tokenizer.py +105 -0
- fireredasr/fireredasr/utils/param.py +13 -0
- fireredasr/fireredasr/utils/wer.py +303 -0
- fireredasr/pretrained_models/README.md +1 -0
- fireredasr/requirements.txt +8 -0
fireredasr DELETED
@@ -1 +0,0 @@
-Subproject commit 1eadb81b66eca948cd492bc0aeedd786333c049d
fireredasr/LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
fireredasr/README.md ADDED
@@ -0,0 +1,160 @@
<div align="center">
<h1>FireRedASR: Open-Source Industrial-Grade
<br>
Automatic Speech Recognition Models</h1>
</div>

[[Paper]](https://arxiv.org/pdf/2501.14350)
[[Model]](https://huggingface.co/fireredteam)
[[Blog]](https://fireredteam.github.io/demos/firered_asr/)

FireRedASR is a family of open-source, industrial-grade automatic speech recognition (ASR) models supporting Mandarin, Chinese dialects, and English. It achieves a new state-of-the-art (SOTA) on public Mandarin ASR benchmarks and also offers outstanding singing-lyrics recognition.


## 🔥 News
- [2025/02/17] We release [FireRedASR-LLM-L](https://huggingface.co/fireredteam/FireRedASR-LLM-L/tree/main) model weights.
- [2025/01/24] We release the [technical report](https://arxiv.org/pdf/2501.14350), [blog](https://fireredteam.github.io/demos/firered_asr/), and [FireRedASR-AED-L](https://huggingface.co/fireredteam/FireRedASR-AED-L/tree/main) model weights.


## Method
FireRedASR is designed to meet diverse requirements for superior performance and optimal efficiency across various applications. It comprises two variants:
- FireRedASR-LLM: designed to achieve state-of-the-art (SOTA) performance and to enable seamless end-to-end speech interaction. It adopts an Encoder-Adapter-LLM framework that leverages large language model (LLM) capabilities.
- FireRedASR-AED: designed to balance high performance with computational efficiency and to serve as an effective speech representation module in LLM-based speech models. It uses an Attention-based Encoder-Decoder (AED) architecture.




## Evaluation
Results are reported as Character Error Rate (CER%) for Chinese and Word Error Rate (WER%) for English.

### Evaluation on Public Mandarin ASR Benchmarks
| Model | #Params | aishell1 | aishell2 | ws\_net | ws\_meeting | Average-4 |
|:----------------:|:-------:|:--------:|:--------:|:-------:|:-----------:|:---------:|
| FireRedASR-LLM | 8.3B | 0.76 | 2.15 | 4.60 | 4.67 | 3.05 |
| FireRedASR-AED | 1.1B | 0.55 | 2.52 | 4.88 | 4.76 | 3.18 |
| Seed-ASR | 12B+ | 0.68 | 2.27 | 4.66 | 5.69 | 3.33 |
| Qwen-Audio | 8.4B | 1.30 | 3.10 | 9.50 | 10.87 | 6.19 |
| SenseVoice-L | 1.6B | 2.09 | 3.04 | 6.01 | 6.73 | 4.47 |
| Whisper-Large-v3 | 1.6B | 5.14 | 4.96 | 10.48 | 18.87 | 9.86 |
| Paraformer-Large | 0.2B | 1.68 | 2.85 | 6.74 | 6.97 | 4.56 |

`ws` means WenetSpeech.

### Evaluation on Public Chinese Dialect and English ASR Benchmarks
| Test Set | KeSpeech | LibriSpeech test-clean | LibriSpeech test-other |
|:---------------------:|:--------:|:----------------------:|:----------------------:|
| FireRedASR-LLM | 3.56 | 1.73 | 3.67 |
| FireRedASR-AED | 4.48 | 1.93 | 4.44 |
| Previous SOTA Results | 6.70 | 1.82 | 3.50 |


## Usage
Download the model files from [huggingface](https://huggingface.co/fireredteam) and place them in the folder `pretrained_models`.

If you want to use `FireRedASR-LLM-L`, you also need to download [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct) and place it in the folder `pretrained_models`. Then go to the folder `FireRedASR-LLM-L` and run `$ ln -s ../Qwen2-7B-Instruct`.


### Setup
Create a Python environment and install the dependencies:
```bash
$ git clone https://github.com/FireRedTeam/FireRedASR.git
$ conda create --name fireredasr python=3.10
$ pip install -r requirements.txt
```

Set up the Linux PATH and PYTHONPATH:
```
$ export PATH=$PWD/fireredasr/:$PWD/fireredasr/utils/:$PATH
$ export PYTHONPATH=$PWD/:$PYTHONPATH
```

Convert audio to 16 kHz 16-bit PCM format:
```
ffmpeg -i input_audio -ar 16000 -ac 1 -acodec pcm_s16le -f wav output.wav
```

### Quick Start
```bash
$ cd examples
$ bash inference_fireredasr_aed.sh
$ bash inference_fireredasr_llm.sh
```

### Command-line Usage
```bash
$ speech2text.py --help
$ speech2text.py --wav_path examples/wav/BAC009S0764W0121.wav --asr_type "aed" --model_dir pretrained_models/FireRedASR-AED-L
$ speech2text.py --wav_path examples/wav/BAC009S0764W0121.wav --asr_type "llm" --model_dir pretrained_models/FireRedASR-LLM-L
```

### Python Usage
```python
from fireredasr.models.fireredasr import FireRedAsr

batch_uttid = ["BAC009S0764W0121"]
batch_wav_path = ["examples/wav/BAC009S0764W0121.wav"]

# FireRedASR-AED
model = FireRedAsr.from_pretrained("aed", "pretrained_models/FireRedASR-AED-L")
results = model.transcribe(
    batch_uttid,
    batch_wav_path,
    {
        "use_gpu": 1,
        "beam_size": 3,
        "nbest": 1,
        "decode_max_len": 0,
        "softmax_smoothing": 1.25,
        "aed_length_penalty": 0.6,
        "eos_penalty": 1.0
    }
)
print(results)


# FireRedASR-LLM
model = FireRedAsr.from_pretrained("llm", "pretrained_models/FireRedASR-LLM-L")
results = model.transcribe(
    batch_uttid,
    batch_wav_path,
    {
        "use_gpu": 1,
        "beam_size": 3,
        "decode_max_len": 0,
        "decode_min_len": 0,
        "repetition_penalty": 3.0,
        "llm_length_penalty": 1.0,
        "temperature": 1.0
    }
)
print(results)
```

## Usage Tips
### Batch Beam Search
- When performing batch beam search with FireRedASR-LLM, make sure the input utterances have similar lengths. If utterance lengths differ significantly, shorter utterances may suffer from repetition issues. Either sort your dataset by length or set `batch_size` to 1 to avoid this, as in the sketch below.
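A minimal sketch of length-sorted batching, assuming the `FireRedAsr` Python API shown above; the `wav_duration` helper and the file names are illustrative, not part of this repository:

```python
import wave

def wav_duration(path):
    # Duration in seconds of a PCM wav file (illustrative helper).
    with wave.open(path, "rb") as f:
        return f.getnframes() / f.getframerate()

# Hypothetical utterance list; sorting by duration keeps each batch
# similar in length, which avoids the repetition issue described above.
utts = [("utt1", "wav/a.wav"), ("utt2", "wav/b.wav"), ("utt3", "wav/c.wav")]
utts.sort(key=lambda u: wav_duration(u[1]))

batch_size = 2
for i in range(0, len(utts), batch_size):
    batch = utts[i:i + batch_size]
    uttids = [u[0] for u in batch]
    paths = [u[1] for u in batch]
    # results = model.transcribe(uttids, paths, {...})  # as in Python Usage
```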
### Input Length Limitations
- FireRedASR-AED supports audio input up to 60 s. Input longer than 60 s may cause hallucination issues, and input exceeding 200 s will trigger positional-encoding errors.
- FireRedASR-LLM supports audio input up to 30 s. The behavior for longer input is currently unknown.


## Acknowledgements
Thanks to the following open-source works:
- [Qwen2-7B-Instruct](https://huggingface.co/Qwen/Qwen2-7B-Instruct)
- [icefall/ASR_LLM](https://github.com/k2-fsa/icefall/tree/master/egs/speech_llm/ASR_LLM)
- [WeNet](https://github.com/wenet-e2e/wenet)
- [Speech-Transformer](https://github.com/kaituoxu/Speech-Transformer)


## Citation
```bibtex
@article{xu2025fireredasr,
  title={FireRedASR: Open-Source Industrial-Grade Mandarin Speech Recognition Models from Encoder-Decoder to LLM Integration},
  author={Xu, Kai-Tuo and Xie, Feng-Long and Tang, Xu and Hu, Yao},
  journal={arXiv preprint arXiv:2501.14350},
  year={2025}
}
```
fireredasr/examples/fireredasr ADDED
@@ -0,0 +1 @@
../fireredasr
fireredasr/examples/inference_fireredasr_aed.sh ADDED
@@ -0,0 +1,33 @@
#!/bin/bash

export PATH=$PWD/fireredasr/:$PWD/fireredasr/utils/:$PATH
export PYTHONPATH=$PWD/:$PYTHONPATH

# model_dir includes model.pth.tar, cmvn.ark, dict.txt
model_dir=$PWD/pretrained_models/FireRedASR-AED-L

# Several input formats are supported; only the last wavs assignment takes effect
wavs="--wav_path wav/BAC009S0764W0121.wav"
wavs="--wav_paths wav/BAC009S0764W0121.wav wav/IT0011W0001.wav wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav wav/TEST_MEETING_T0000000001_S00000.wav"
wavs="--wav_dir wav/"
wavs="--wav_scp wav/wav.scp"

out="out/aed-l-asr.txt"

decode_args="
    --batch_size 2 --beam_size 3 --nbest 1
    --decode_max_len 0 --softmax_smoothing 1.25 --aed_length_penalty 0.6
    --eos_penalty 1.0
"

mkdir -p $(dirname $out)
set -x


CUDA_VISIBLE_DEVICES=0 \
speech2text.py --asr_type "aed" --model_dir $model_dir $decode_args $wavs --output $out


ref="wav/text"
wer.py --print_sentence_wer 1 --do_tn 0 --rm_special 0 --ref $ref --hyp $out > $out.wer 2>&1
tail -n8 $out.wer
fireredasr/examples/inference_fireredasr_llm.sh ADDED
@@ -0,0 +1,32 @@
#!/bin/bash

export PATH=$PWD/fireredasr/:$PWD/fireredasr/utils/:$PATH
export PYTHONPATH=$PWD/:$PYTHONPATH

# model_dir includes model.pth.tar, asr_encoder.pth.tar, cmvn.ark, Qwen2-7B-Instruct
model_dir=$PWD/pretrained_models/FireRedASR-LLM-L

# Several input formats are supported; only the last wavs assignment takes effect
wavs="--wav_path wav/BAC009S0764W0121.wav"
wavs="--wav_paths wav/BAC009S0764W0121.wav wav/IT0011W0001.wav wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav wav/TEST_MEETING_T0000000001_S00000.wav"
wavs="--wav_dir wav/"
wavs="--wav_scp wav/wav.scp"

out="out/llm-l-asr.txt"

decode_args="
    --batch_size 1 --beam_size 3 --decode_max_len 0 --decode_min_len 0
    --repetition_penalty 3.0 --llm_length_penalty 1.0 --temperature 1.0
"

mkdir -p $(dirname $out)
set -x


CUDA_VISIBLE_DEVICES=0 \
speech2text.py --asr_type "llm" --model_dir $model_dir $decode_args $wavs --output $out


ref="wav/text"
wer.py --print_sentence_wer 1 --do_tn 0 --rm_special 1 --ref $ref --hyp $out > $out.wer 2>&1
tail -n8 $out.wer
fireredasr/examples/pretrained_models ADDED
@@ -0,0 +1 @@
../pretrained_models
fireredasr/examples/wav/IT0011W0001.wav ADDED
Binary file (63.8 kB)
fireredasr/examples/wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav ADDED
Binary file (57.6 kB)
fireredasr/examples/wav/text ADDED
@@ -0,0 +1,4 @@
BAC009S0764W0121 甚至 出现 交易 几乎 停滞 的 情况
IT0011W0001 换一首歌
TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000 我有的时候说不清楚你们知道吗
TEST_MEETING_T0000000001_S00000 好首先说一下刚才这个经理说完的这个销售问题咱再说一下咱们的商场问题首先咱们商场上半年业这个先各部门儿汇报一下就是业绩
fireredasr/examples/wav/wav.scp ADDED
@@ -0,0 +1,4 @@
BAC009S0764W0121 wav/BAC009S0764W0121.wav
IT0011W0001 wav/IT0011W0001.wav
TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000 wav/TEST_NET_Y0000000000_-KTKHdZ2fb8_S00000.wav
TEST_MEETING_T0000000001_S00000 wav/TEST_MEETING_T0000000001_S00000.wav
fireredasr/fireredasr/.speech2text.py.swp ADDED
Binary file (12.3 kB)
fireredasr/fireredasr/data/asr_feat.py ADDED
@@ -0,0 +1,107 @@
import math
import os

import kaldiio
import kaldi_native_fbank as knf
import numpy as np
import torch


class ASRFeatExtractor:
    def __init__(self, kaldi_cmvn_file):
        self.cmvn = CMVN(kaldi_cmvn_file) if kaldi_cmvn_file != "" else None
        self.fbank = KaldifeatFbank(num_mel_bins=80, frame_length=25,
                                    frame_shift=10, dither=0.0)

    def __call__(self, wav_paths):
        feats = []
        durs = []
        for wav_path in wav_paths:
            sample_rate, wav_np = kaldiio.load_mat(wav_path)
            dur = wav_np.shape[0] / sample_rate
            fbank = self.fbank((sample_rate, wav_np))
            if self.cmvn is not None:
                fbank = self.cmvn(fbank)
            fbank = torch.from_numpy(fbank).float()
            feats.append(fbank)
            durs.append(dur)
        lengths = torch.tensor([feat.size(0) for feat in feats]).long()
        feats_pad = self.pad_feat(feats, 0.0)
        return feats_pad, lengths, durs

    def pad_feat(self, xs, pad_value):
        # type: (List[Tensor], int) -> Tensor
        n_batch = len(xs)
        max_len = max([xs[i].size(0) for i in range(n_batch)])
        pad = torch.ones(n_batch, max_len, *xs[0].size()[1:]).to(xs[0].device).to(xs[0].dtype).fill_(pad_value)
        for i in range(n_batch):
            pad[i, :xs[i].size(0)] = xs[i]
        return pad


class CMVN:
    def __init__(self, kaldi_cmvn_file):
        self.dim, self.means, self.inverse_std_variances = \
            self.read_kaldi_cmvn(kaldi_cmvn_file)

    def __call__(self, x, is_train=False):
        assert x.shape[-1] == self.dim, "CMVN dim mismatch"
        out = x - self.means
        out = out * self.inverse_std_variances
        return out

    def read_kaldi_cmvn(self, kaldi_cmvn_file):
        assert os.path.exists(kaldi_cmvn_file)
        stats = kaldiio.load_mat(kaldi_cmvn_file)
        assert stats.shape[0] == 2
        dim = stats.shape[-1] - 1
        count = stats[0, dim]
        assert count >= 1
        floor = 1e-20
        means = []
        inverse_std_variances = []
        for d in range(dim):
            mean = stats[0, d] / count
            means.append(mean.item())
            variance = (stats[1, d] / count) - mean * mean
            if variance < floor:
                variance = floor
            istd = 1.0 / math.sqrt(variance)
            inverse_std_variances.append(istd)
        return dim, np.array(means), np.array(inverse_std_variances)


class KaldifeatFbank:
    def __init__(self, num_mel_bins=80, frame_length=25, frame_shift=10,
                 dither=1.0):
        self.dither = dither
        opts = knf.FbankOptions()
        opts.frame_opts.dither = dither
        opts.mel_opts.num_bins = num_mel_bins
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

    def __call__(self, wav, is_train=False):
        if type(wav) is str:
            sample_rate, wav_np = kaldiio.load_mat(wav)
        elif type(wav) in [tuple, list] and len(wav) == 2:
            sample_rate, wav_np = wav
        assert len(wav_np.shape) == 1

        dither = self.dither if is_train else 0.0
        self.opts.frame_opts.dither = dither
        fbank = knf.OnlineFbank(self.opts)

        fbank.accept_waveform(sample_rate, wav_np.tolist())
        feat = []
        for i in range(fbank.num_frames_ready):
            feat.append(fbank.get_frame(i))
        if len(feat) == 0:
            print("Check data, len(feat) == 0", wav, flush=True)
            return np.zeros((0, self.opts.mel_opts.num_bins))
        feat = np.vstack(feat)
        return feat
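To make the CMVN arithmetic in read_kaldi_cmvn concrete, here is a minimal self-check with toy statistics (a standalone sketch, not repository code): the first row of a Kaldi cmvn matrix holds per-dimension sums with the frame count in the last column, the second row holds per-dimension sums of squares, and the mean and inverse standard deviation follow directly.

```python
import math
import numpy as np

# Toy Kaldi-style cmvn stats: dim=2, count=4 frames.
# Row 0: per-dim sums, then frame count; row 1: per-dim sums of squares.
stats = np.array([[8.0, 12.0, 4.0],
                  [20.0, 40.0, 0.0]])
dim, count = stats.shape[-1] - 1, stats[0, -1]
for d in range(dim):
    mean = stats[0, d] / count                    # 2.0, 3.0
    variance = stats[1, d] / count - mean * mean  # 5-4=1.0, 10-9=1.0
    istd = 1.0 / math.sqrt(variance)              # 1.0, 1.0
    print(d, mean, istd)  # normalization is (x - mean) * istd
```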
fireredasr/fireredasr/data/token_dict.py ADDED
@@ -0,0 +1,59 @@
import logging


class TokenDict:
    def __init__(self, dict_path, unk=""):
        assert dict_path != ""
        self.id2word, self.word2id = self.read_dict(dict_path)
        self.unk = unk
        assert unk == "" or unk in self.word2id
        self.unkid = self.word2id[unk] if unk else -1

    def get(self, key, default):
        if type(default) == str:
            default = self.word2id[default]
        return self.word2id.get(key, default)

    def __getitem__(self, key):
        if type(key) == str:
            if self.unk:
                return self.word2id.get(key, self.word2id[self.unk])
            else:
                return self.word2id[key]
        elif type(key) == int:
            return self.id2word[key]
        else:
            raise TypeError("Key should be str or int")

    def __len__(self):
        return len(self.id2word)

    def __contains__(self, query):
        if type(query) == str:
            return query in self.word2id
        elif type(query) == int:
            return query in self.id2word
        else:
            raise TypeError("query should be str or int")

    def read_dict(self, dict_path):
        id2word, word2id = [], {}
        with open(dict_path, encoding='utf8') as f:
            for i, line in enumerate(f):
                tokens = line.strip().split()
                if len(tokens) >= 2:
                    word, index = tokens[0], int(tokens[1])
                elif len(tokens) == 1:
                    word, index = tokens[0], i
                else:  # empty line or space
                    logging.info(f"Found empty line or space '{line.strip()}' in {dict_path}:L{i}, set to ' '")
                    word, index = " ", i
                assert len(id2word) == index
                assert len(word2id) == index
                if word == "<space>":
                    logging.info(f"NOTE: Found <space> in {dict_path}:L{i} and converted it to ' '")
                    word = " "
                word2id[word] = index
                id2word.append(word)
        assert len(id2word) == len(word2id)
        return id2word, word2id
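A short usage sketch for TokenDict with a hypothetical four-token dict file (real models ship their own dict.txt); it shows the str-to-id and id-to-str lookups, the unk fallback, and the `<space>` conversion:

```python
from fireredasr.data.token_dict import TokenDict

# Hypothetical dict file for illustration only.
with open("/tmp/dict.txt", "w", encoding="utf8") as f:
    f.write("<pad> 0\n<unk> 1\n<space> 2\nhello 3\n")

d = TokenDict("/tmp/dict.txt", unk="<unk>")
print(d["hello"])    # 3   (str -> id)
print(d[3])          # "hello"   (id -> str)
print(d["missing"])  # 1, falls back to the <unk> id
print(" " in d)      # True: "<space>" is stored as " "
```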
fireredasr/fireredasr/models/.fireredasr.py.swp ADDED
Binary file (16.4 kB)
fireredasr/fireredasr/models/fireredasr.py ADDED
@@ -0,0 +1,125 @@
import os
import time

import torch

from fireredasr.data.asr_feat import ASRFeatExtractor
from fireredasr.models.fireredasr_aed import FireRedAsrAed
from fireredasr.models.fireredasr_llm import FireRedAsrLlm
from fireredasr.tokenizer.aed_tokenizer import ChineseCharEnglishSpmTokenizer
from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper


class FireRedAsr:
    @classmethod
    def from_pretrained(cls, asr_type, model_dir):
        assert asr_type in ["aed", "llm"]

        cmvn_path = os.path.join(model_dir, "cmvn.ark")
        feat_extractor = ASRFeatExtractor(cmvn_path)

        if asr_type == "aed":
            model_path = os.path.join(model_dir, "model.pth.tar")
            dict_path = os.path.join(model_dir, "dict.txt")
            spm_model = os.path.join(model_dir, "train_bpe1000.model")
            model = load_fireredasr_aed_model(model_path)
            tokenizer = ChineseCharEnglishSpmTokenizer(dict_path, spm_model)
        elif asr_type == "llm":
            model_path = os.path.join(model_dir, "model.pth.tar")
            encoder_path = os.path.join(model_dir, "asr_encoder.pth.tar")
            llm_dir = os.path.join(model_dir, "Qwen2-7B-Instruct")
            model, tokenizer = load_firered_llm_model_and_tokenizer(
                model_path, encoder_path, llm_dir)
        model.eval()
        return cls(asr_type, feat_extractor, model, tokenizer)

    def __init__(self, asr_type, feat_extractor, model, tokenizer):
        self.asr_type = asr_type
        self.feat_extractor = feat_extractor
        self.model = model
        self.tokenizer = tokenizer

    @torch.no_grad()
    def transcribe(self, batch_uttid, batch_wav_path, args={}):
        feats, lengths, durs = self.feat_extractor(batch_wav_path)
        total_dur = sum(durs)
        if args.get("use_gpu", False):
            feats, lengths = feats.cuda(), lengths.cuda()
            self.model.cuda()
        else:
            self.model.cpu()

        if self.asr_type == "aed":
            start_time = time.time()

            hyps = self.model.transcribe(
                feats, lengths,
                args.get("beam_size", 1),
                args.get("nbest", 1),
                args.get("decode_max_len", 0),
                args.get("softmax_smoothing", 1.0),
                args.get("aed_length_penalty", 0.0),
                args.get("eos_penalty", 1.0)
            )

            elapsed = time.time() - start_time
            rtf = elapsed / total_dur if total_dur > 0 else 0

            results = []
            for uttid, wav, hyp in zip(batch_uttid, batch_wav_path, hyps):
                hyp = hyp[0]  # only return 1-best
                hyp_ids = [int(id) for id in hyp["yseq"].cpu()]
                text = self.tokenizer.detokenize(hyp_ids)
                results.append({"uttid": uttid, "text": text, "wav": wav,
                                "rtf": f"{rtf:.4f}"})
            return results

        elif self.asr_type == "llm":
            input_ids, attention_mask, _, _ = \
                LlmTokenizerWrapper.preprocess_texts(
                    origin_texts=[""] * feats.size(0), tokenizer=self.tokenizer,
                    max_len=128, decode=True)
            if args.get("use_gpu", False):
                input_ids = input_ids.cuda()
                attention_mask = attention_mask.cuda()
            start_time = time.time()

            generated_ids = self.model.transcribe(
                feats, lengths, input_ids, attention_mask,
                args.get("beam_size", 1),
                args.get("decode_max_len", 0),
                args.get("decode_min_len", 0),
                args.get("repetition_penalty", 1.0),
                args.get("llm_length_penalty", 0.0),
                args.get("temperature", 1.0)
            )

            elapsed = time.time() - start_time
            rtf = elapsed / total_dur if total_dur > 0 else 0
            texts = self.tokenizer.batch_decode(generated_ids,
                                                skip_special_tokens=True)
            results = []
            for uttid, wav, text in zip(batch_uttid, batch_wav_path, texts):
                results.append({"uttid": uttid, "text": text, "wav": wav,
                                "rtf": f"{rtf:.4f}"})
            return results


def load_fireredasr_aed_model(model_path):
    package = torch.load(model_path, map_location=lambda storage, loc: storage)
    print("model args:", package["args"])
    model = FireRedAsrAed.from_args(package["args"])
    model.load_state_dict(package["model_state_dict"], strict=True)
    return model


def load_firered_llm_model_and_tokenizer(model_path, encoder_path, llm_dir):
    package = torch.load(model_path, map_location=lambda storage, loc: storage)
    package["args"].encoder_path = encoder_path
    package["args"].llm_dir = llm_dir
    print("model args:", package["args"])
    model = FireRedAsrLlm.from_args(package["args"])
    model.load_state_dict(package["model_state_dict"], strict=False)
    tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(llm_dir)
    return model, tokenizer
fireredasr/fireredasr/models/fireredasr_aed.py ADDED
@@ -0,0 +1,35 @@
import torch

from fireredasr.models.module.conformer_encoder import ConformerEncoder
from fireredasr.models.module.transformer_decoder import TransformerDecoder


class FireRedAsrAed(torch.nn.Module):
    @classmethod
    def from_args(cls, args):
        return cls(args)

    def __init__(self, args):
        super().__init__()
        self.sos_id = args.sos_id
        self.eos_id = args.eos_id

        self.encoder = ConformerEncoder(
            args.idim, args.n_layers_enc, args.n_head, args.d_model,
            args.residual_dropout, args.dropout_rate,
            args.kernel_size, args.pe_maxlen)

        self.decoder = TransformerDecoder(
            args.sos_id, args.eos_id, args.pad_id, args.odim,
            args.n_layers_dec, args.n_head, args.d_model,
            args.residual_dropout, args.pe_maxlen)

    def transcribe(self, padded_input, input_lengths,
                   beam_size=1, nbest=1, decode_max_len=0,
                   softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0):
        enc_outputs, _, enc_mask = self.encoder(padded_input, input_lengths)
        nbest_hyps = self.decoder.batch_beam_search(
            enc_outputs, enc_mask,
            beam_size, nbest, decode_max_len,
            softmax_smoothing, length_penalty, eos_penalty)
        return nbest_hyps
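FireRedAsrAed reads its hyperparameters from an args namespace, which is normally restored from the checkpoint's package["args"] (see fireredasr.py above). A hedged sketch with made-up small dimensions, just to show which attributes the constructor expects; whether these toy values construct cleanly depends on the encoder/decoder modules, so treat this as illustrative only:

```python
from argparse import Namespace

from fireredasr.models.fireredasr_aed import FireRedAsrAed

# Illustrative values only; the real ones come from the released checkpoint.
args = Namespace(
    idim=80, n_layers_enc=4, n_head=4, d_model=256,
    residual_dropout=0.1, dropout_rate=0.1, kernel_size=33, pe_maxlen=5000,
    sos_id=1, eos_id=2, pad_id=0, odim=1000, n_layers_dec=2,
)
model = FireRedAsrAed.from_args(args)
```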
fireredasr/fireredasr/models/fireredasr_llm.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
from transformers import AutoModelForCausalLM
|
| 9 |
+
|
| 10 |
+
from fireredasr.models.fireredasr_aed import FireRedAsrAed
|
| 11 |
+
from fireredasr.models.module.adapter import Adapter
|
| 12 |
+
from fireredasr.tokenizer.llm_tokenizer import DEFAULT_SPEECH_TOKEN, IGNORE_TOKEN_ID
|
| 13 |
+
from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper
|
| 14 |
+
from fireredasr.utils.param import count_model_parameters
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class FireRedAsrLlm(nn.Module):
|
| 18 |
+
@classmethod
|
| 19 |
+
def load_encoder(cls, model_path):
|
| 20 |
+
assert os.path.exists(model_path)
|
| 21 |
+
package = torch.load(model_path, map_location=lambda storage, loc: storage)
|
| 22 |
+
model = FireRedAsrAed.from_args(package["args"])
|
| 23 |
+
if "model_state_dict" in package:
|
| 24 |
+
model.load_state_dict(package["model_state_dict"], strict=False)
|
| 25 |
+
encoder = model.encoder
|
| 26 |
+
encoder_dim = encoder.odim
|
| 27 |
+
return encoder, encoder_dim
|
| 28 |
+
|
| 29 |
+
@classmethod
|
| 30 |
+
def from_args(cls, args):
|
| 31 |
+
logging.info(args)
|
| 32 |
+
logging.info("Build FireRedAsrLlm")
|
| 33 |
+
# Build Speech Encoder
|
| 34 |
+
encoder, encoder_dim = cls.load_encoder(args.encoder_path)
|
| 35 |
+
count_model_parameters(encoder)
|
| 36 |
+
if args.freeze_encoder:
|
| 37 |
+
logging.info(f"Frezee encoder")
|
| 38 |
+
for name, param in encoder.named_parameters():
|
| 39 |
+
param.requires_grad = False
|
| 40 |
+
encoder.eval()
|
| 41 |
+
|
| 42 |
+
if args.use_flash_attn:
|
| 43 |
+
attn_implementation = "flash_attention_2"
|
| 44 |
+
if args.use_fp16:
|
| 45 |
+
torch_dtype = torch.float16
|
| 46 |
+
else:
|
| 47 |
+
torch_dtype = torch.float32
|
| 48 |
+
else:
|
| 49 |
+
attn_implementation = "eager"
|
| 50 |
+
if args.use_fp16:
|
| 51 |
+
torch_dtype = torch.float16
|
| 52 |
+
else:
|
| 53 |
+
torch_dtype = torch.float32
|
| 54 |
+
|
| 55 |
+
# Build LLM
|
| 56 |
+
llm = AutoModelForCausalLM.from_pretrained(
|
| 57 |
+
args.llm_dir,
|
| 58 |
+
attn_implementation=attn_implementation,
|
| 59 |
+
torch_dtype=torch_dtype,
|
| 60 |
+
)
|
| 61 |
+
count_model_parameters(llm)
|
| 62 |
+
|
| 63 |
+
# LLM Freeze or LoRA
|
| 64 |
+
llm_dim = llm.config.hidden_size
|
| 65 |
+
if args.freeze_llm:
|
| 66 |
+
logging.info(f"Frezee LLM")
|
| 67 |
+
for name, param in llm.named_parameters():
|
| 68 |
+
param.requires_grad = False
|
| 69 |
+
llm.eval()
|
| 70 |
+
else:
|
| 71 |
+
if args.use_lora:
|
| 72 |
+
from peft import LoraConfig, get_peft_model
|
| 73 |
+
lora_config = LoraConfig(
|
| 74 |
+
r=64,
|
| 75 |
+
lora_alpha=16,
|
| 76 |
+
target_modules=[
|
| 77 |
+
"q_proj",
|
| 78 |
+
"k_proj",
|
| 79 |
+
"v_proj",
|
| 80 |
+
"o_proj",
|
| 81 |
+
"up_proj",
|
| 82 |
+
"gate_proj",
|
| 83 |
+
"down_proj",
|
| 84 |
+
],
|
| 85 |
+
lora_dropout=0.05,
|
| 86 |
+
task_type="CAUSAL_LM",
|
| 87 |
+
)
|
| 88 |
+
llm = get_peft_model(llm, lora_config)
|
| 89 |
+
llm.print_trainable_parameters()
|
| 90 |
+
|
| 91 |
+
tokenizer = LlmTokenizerWrapper.build_llm_tokenizer(args.llm_dir)
|
| 92 |
+
assert tokenizer.pad_token_id == tokenizer.convert_tokens_to_ids("<|endoftext|>")
|
| 93 |
+
llm.config.pad_token_id = tokenizer.pad_token_id
|
| 94 |
+
llm.config.bos_token_id = tokenizer.convert_tokens_to_ids("<|im_start|>")
|
| 95 |
+
llm.config.eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
|
| 96 |
+
llm.config.default_speech_token_id = tokenizer.convert_tokens_to_ids(
|
| 97 |
+
DEFAULT_SPEECH_TOKEN
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# Build projector
|
| 101 |
+
encoder_projector = Adapter(
|
| 102 |
+
encoder_dim, llm_dim, args.encoder_downsample_rate)
|
| 103 |
+
count_model_parameters(encoder_projector)
|
| 104 |
+
|
| 105 |
+
return cls(encoder, llm, encoder_projector,
|
| 106 |
+
args.freeze_encoder, args.freeze_llm)
|
| 107 |
+
|
| 108 |
+
def __init__(self, encoder, llm, encoder_projector,
|
| 109 |
+
freeze_encoder, freeze_llm):
|
| 110 |
+
super().__init__()
|
| 111 |
+
self.encoder = encoder
|
| 112 |
+
self.llm = llm
|
| 113 |
+
self.encoder_projector = encoder_projector
|
| 114 |
+
# args
|
| 115 |
+
self.freeze_encoder = freeze_encoder
|
| 116 |
+
self.freeze_llm = freeze_llm
|
| 117 |
+
self.llm_config = llm.config
|
| 118 |
+
|
| 119 |
+
def transcribe(self, padded_feat, feat_lengths, padded_input_ids, attention_mask,
|
| 120 |
+
beam_size=1, decode_max_len=0, decode_min_len=0,
|
| 121 |
+
repetition_penalty=1.0, llm_length_penalty=1.0, temperature=1.0):
|
| 122 |
+
encoder_outs, enc_lengths, enc_mask = self.encoder(padded_feat, feat_lengths)
|
| 123 |
+
speech_features, speech_lens = self.encoder_projector(encoder_outs, enc_lengths)
|
| 124 |
+
inputs_embeds = self.llm.get_input_embeddings()(padded_input_ids)
|
| 125 |
+
|
| 126 |
+
inputs_embeds, attention_mask, _ = \
|
| 127 |
+
self._merge_input_ids_with_speech_features(
|
| 128 |
+
speech_features.to(inputs_embeds.dtype), inputs_embeds, padded_input_ids, attention_mask,
|
| 129 |
+
speech_lens=speech_lens
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
max_new_tokens = speech_features.size(1) if decode_max_len < 1 else decode_max_len
|
| 133 |
+
max_new_tokens = max(1, max_new_tokens)
|
| 134 |
+
|
| 135 |
+
generated_ids = self.llm.generate(
|
| 136 |
+
inputs_embeds=inputs_embeds,
|
| 137 |
+
max_new_tokens=max_new_tokens,
|
| 138 |
+
num_beams=beam_size,
|
| 139 |
+
do_sample=False,
|
| 140 |
+
min_length=decode_min_len,
|
| 141 |
+
top_p=1.0,
|
| 142 |
+
repetition_penalty=repetition_penalty,
|
| 143 |
+
length_penalty=llm_length_penalty,
|
| 144 |
+
temperature=temperature,
|
| 145 |
+
bos_token_id=self.llm.config.bos_token_id,
|
| 146 |
+
eos_token_id=self.llm.config.eos_token_id,
|
| 147 |
+
pad_token_id=self.llm.config.pad_token_id,
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
return generated_ids
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _merge_input_ids_with_speech_features(
|
| 154 |
+
self, speech_features, inputs_embeds, input_ids, attention_mask, labels=None,
|
| 155 |
+
speech_lens=None
|
| 156 |
+
):
|
| 157 |
+
"""
|
| 158 |
+
Modified from: https://github.com/k2-fsa/icefall/blob/master/egs/speech_llm/ASR_LLM/whisper_llm_zh/model.py
|
| 159 |
+
"""
|
| 160 |
+
speech_lens = None
|
| 161 |
+
num_speechs, speech_len, embed_dim = speech_features.shape
|
| 162 |
+
batch_size, sequence_length = input_ids.shape
|
| 163 |
+
left_padding = not torch.sum(
|
| 164 |
+
input_ids[:, -1] == torch.tensor(self.llm.config.pad_token_id)
|
| 165 |
+
)
|
| 166 |
+
# 1. Create a mask to know where special speech tokens are
|
| 167 |
+
special_speech_token_mask = input_ids == self.llm.config.default_speech_token_id
|
| 168 |
+
num_special_speech_tokens = torch.sum(special_speech_token_mask, dim=-1)
|
| 169 |
+
# Compute the maximum embed dimension
|
| 170 |
+
max_embed_dim = (
|
| 171 |
+
num_special_speech_tokens.max() * (speech_len - 1)
|
| 172 |
+
) + sequence_length
|
| 173 |
+
batch_indices, non_speech_indices = torch.where(
|
| 174 |
+
input_ids != self.llm.config.default_speech_token_id
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
# 2. Compute the positions where text should be written
|
| 178 |
+
# Calculate new positions for text tokens in merged speech-text sequence.
|
| 179 |
+
# `special_speech_token_mask` identifies speech tokens. Each speech token will be replaced by `nb_text_tokens_per_speechs - 1` text tokens.
|
| 180 |
+
        # `torch.cumsum` computes how each speech token shifts subsequent text token positions.
        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
        new_token_positions = (
            torch.cumsum((special_speech_token_mask * (speech_len - 1) + 1), -1) - 1
        )  # (N,U)
        nb_speech_pad = max_embed_dim - 1 - new_token_positions[:, -1]
        if left_padding:
            new_token_positions += nb_speech_pad[:, None]  # offset for left padding
        text_to_overwrite = new_token_positions[batch_indices, non_speech_indices]

        # 3. Create the full embedding, already padded to the maximum position
        final_embedding = torch.zeros(
            batch_size,
            max_embed_dim,
            embed_dim,
            dtype=inputs_embeds.dtype,
            device=inputs_embeds.device,
        )
        final_attention_mask = torch.zeros(
            batch_size,
            max_embed_dim,
            dtype=attention_mask.dtype,
            device=inputs_embeds.device,
        )
        if labels is not None:
            final_labels = torch.full(
                (batch_size, max_embed_dim),
                IGNORE_TOKEN_ID,
                dtype=input_ids.dtype,
                device=input_ids.device,
            )
        # In case the speech encoder or the language model has been offloaded to CPU, we need to manually
        # set the corresponding tensors into their correct target device.
        target_device = inputs_embeds.device
        batch_indices, non_speech_indices, text_to_overwrite = (
            batch_indices.to(target_device),
            non_speech_indices.to(target_device),
            text_to_overwrite.to(target_device),
        )
        attention_mask = attention_mask.to(target_device)

        # 4. Fill the embeddings based on the mask. If we have ["hey", "<speech>", "how", "are"]
        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the speech features
        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[
            batch_indices, non_speech_indices
        ]
        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[
            batch_indices, non_speech_indices
        ]
        if labels is not None:
            final_labels[batch_indices, text_to_overwrite] = labels[
                batch_indices, non_speech_indices
            ]

        # 5. Fill the embeddings corresponding to the speech features. Anything that is not `text_positions` needs filling (#29835)
        speech_to_overwrite = torch.full(
            (batch_size, max_embed_dim),
            True,
            dtype=torch.bool,
            device=inputs_embeds.device,
        )
        speech_to_overwrite[batch_indices, text_to_overwrite] = False
        if speech_lens is not None:
            speech_pad_position = speech_to_overwrite.cumsum(-1) <= speech_lens[:, None]
        speech_to_overwrite &= speech_to_overwrite.cumsum(-1) - 1 >= nb_speech_pad[
            :, None
        ].to(target_device)

        if speech_to_overwrite.sum() != speech_features.shape[:-1].numel():
            raise ValueError(
                f"The input provided to the model is wrong. The number of speech tokens is {torch.sum(special_speech_token_mask)} while"
                f" the number of speech features given to the model is {num_speechs}. This prevents correct indexing and breaks batch generation."
            )

        final_embedding[speech_to_overwrite] = (
            speech_features.contiguous().reshape(-1, embed_dim).to(target_device)
        )
        if speech_lens is not None:
            speech_to_overwrite &= speech_pad_position
        final_attention_mask |= speech_to_overwrite

        # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
        batch_indices, pad_indices = torch.where(
            input_ids == self.llm.config.pad_token_id
        )
        indices_to_mask = new_token_positions[batch_indices, pad_indices]

        final_embedding[batch_indices, indices_to_mask] = 0

        if labels is None:
            final_labels = None

        return final_embedding, final_attention_mask, final_labels  #, position_ids
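The position arithmetic above is compact; a standalone toy (values invented, nothing here comes from the repo) shows what `new_token_positions` encodes:

import torch

# One <speech> placeholder in a 6-token prompt, expanding to speech_len = 4 slots.
speech_len = 4
special_speech_token_mask = torch.tensor([[0, 1, 0, 0, 0, 0]])
new_token_positions = torch.cumsum(
    special_speech_token_mask * (speech_len - 1) + 1, -1) - 1
print(new_token_positions)  # tensor([[0, 4, 5, 6, 7, 8]])
# The text token before <speech> stays at slot 0; <speech> itself maps to slot 4
# (the last of its 4 slots), and every later text token shifts right by 3.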
fireredasr/fireredasr/models/module/adapter.py ADDED
@@ -0,0 +1,30 @@
import torch
import torch.nn as nn


class Adapter(nn.Module):
    def __init__(self, encoder_dim, llm_dim, downsample_rate=2):
        super().__init__()
        self.ds = downsample_rate
        self.linear1 = nn.Linear(encoder_dim * downsample_rate, llm_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(llm_dim, llm_dim)

    def forward(self, x, x_lens):
        batch_size, seq_len, feat_dim = x.size()
        num_frames_to_discard = seq_len % self.ds
        if num_frames_to_discard > 0:
            x = x[:, :-num_frames_to_discard, :]
        seq_len = x.size(1)

        x = x.contiguous()
        x = x.view(
            batch_size, seq_len // self.ds, feat_dim * self.ds
        )

        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)

        new_x_lens = torch.clamp(x_lens, max=seq_len) // self.ds
        return x, new_x_lens
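Not part of the commit itself, but a minimal usage sketch of the adapter above; the dimensions are illustrative assumptions, not values taken from this repo:

import torch
from fireredasr.models.module.adapter import Adapter

# Hypothetical sizes: a 512-d encoder projected into a 3584-d LLM space.
adapter = Adapter(encoder_dim=512, llm_dim=3584, downsample_rate=2)
enc_out = torch.randn(4, 101, 512)           # (batch, frames, encoder_dim)
enc_lens = torch.tensor([101, 80, 64, 50])   # valid frames per utterance
feats, feat_lens = adapter(enc_out, enc_lens)
print(feats.shape)   # torch.Size([4, 50, 3584]): the odd 101st frame is dropped
print(feat_lens)     # tensor([50, 40, 32, 25]): lengths are halved as well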
fireredasr/fireredasr/models/module/conformer_encoder.py ADDED
@@ -0,0 +1,322 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConformerEncoder(nn.Module):
    def __init__(self, idim, n_layers, n_head, d_model,
                 residual_dropout=0.1, dropout_rate=0.1, kernel_size=33,
                 pe_maxlen=5000):
        super().__init__()
        self.odim = d_model

        self.input_preprocessor = Conv2dSubsampling(idim, d_model)
        self.positional_encoding = RelPositionalEncoding(d_model)
        self.dropout = nn.Dropout(residual_dropout)

        self.layer_stack = nn.ModuleList()
        for l in range(n_layers):
            block = RelPosEmbConformerBlock(d_model, n_head,
                                            residual_dropout,
                                            dropout_rate, kernel_size)
            self.layer_stack.append(block)

    def forward(self, padded_input, input_lengths, pad=True):
        if pad:
            padded_input = F.pad(padded_input,
                                 (0, 0, 0, self.input_preprocessor.context - 1), 'constant', 0.0)
        src_mask = self.padding_position_is_0(padded_input, input_lengths)

        embed_output, input_lengths, src_mask = self.input_preprocessor(padded_input, src_mask)
        enc_output = self.dropout(embed_output)

        pos_emb = self.dropout(self.positional_encoding(embed_output))

        enc_outputs = []
        for enc_layer in self.layer_stack:
            enc_output = enc_layer(enc_output, pos_emb, slf_attn_mask=src_mask,
                                   pad_mask=src_mask)
            enc_outputs.append(enc_output)

        return enc_output, input_lengths, src_mask

    def padding_position_is_0(self, padded_input, input_lengths):
        N, T = padded_input.size()[:2]
        mask = torch.ones((N, T)).to(padded_input.device)
        for i in range(N):
            mask[i, input_lengths[i]:] = 0
        mask = mask.unsqueeze(dim=1)
        return mask.to(torch.uint8)


class RelPosEmbConformerBlock(nn.Module):
    def __init__(self, d_model, n_head,
                 residual_dropout=0.1,
                 dropout_rate=0.1, kernel_size=33):
        super().__init__()
        self.ffn1 = ConformerFeedForward(d_model, dropout_rate)
        self.mhsa = RelPosMultiHeadAttention(n_head, d_model,
                                             residual_dropout)
        self.conv = ConformerConvolution(d_model, kernel_size,
                                         dropout_rate)
        self.ffn2 = ConformerFeedForward(d_model, dropout_rate)
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x, pos_emb, slf_attn_mask=None, pad_mask=None):
        out = 0.5 * x + 0.5 * self.ffn1(x)
        out = self.mhsa(out, out, out, pos_emb, mask=slf_attn_mask)[0]
        out = self.conv(out, pad_mask)
        out = 0.5 * out + 0.5 * self.ffn2(out)
        out = self.layer_norm(out)
        return out


class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)


class Conv2dSubsampling(nn.Module):
    def __init__(self, idim, d_model, out_channels=32):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, out_channels, 3, 2),
            nn.ReLU(),
            nn.Conv2d(out_channels, out_channels, 3, 2),
            nn.ReLU(),
        )
        subsample_idim = ((idim - 1) // 2 - 1) // 2
        self.out = nn.Linear(out_channels * subsample_idim, d_model)

        self.subsampling = 4
        left_context = right_context = 3  # both exclude current frame
        self.context = left_context + 1 + right_context  # 7

    def forward(self, x, x_mask):
        x = x.unsqueeze(1)
        x = self.conv(x)
        N, C, T, D = x.size()
        x = self.out(x.transpose(1, 2).contiguous().view(N, T, C * D))
        mask = x_mask[:, :, :-2:2][:, :, :-2:2]
        input_lengths = mask[:, -1, :].sum(dim=-1)
        return x, input_lengths, mask


class RelPositionalEncoding(torch.nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe_positive = torch.zeros(max_len, d_model, requires_grad=False)
        pe_negative = torch.zeros(max_len, d_model, requires_grad=False)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(torch.log(torch.tensor(10000.0)).item()/d_model))
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)

        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        pe_negative = pe_negative[1:].unsqueeze(0)
        pe = torch.cat([pe_positive, pe_negative], dim=1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Tmax = 2 * max_len - 1
        Tmax, T = self.pe.size(1), x.size(1)
        pos_emb = self.pe[:, Tmax // 2 - T + 1 : Tmax // 2 + T].clone().detach()
        return pos_emb


class ConformerFeedForward(nn.Module):
    def __init__(self, d_model, dropout_rate=0.1):
        super().__init__()
        pre_layer_norm = nn.LayerNorm(d_model)
        linear_expand = nn.Linear(d_model, d_model*4)
        nonlinear = Swish()
        dropout_pre = nn.Dropout(dropout_rate)
        linear_project = nn.Linear(d_model*4, d_model)
        dropout_post = nn.Dropout(dropout_rate)
        self.net = nn.Sequential(pre_layer_norm,
                                 linear_expand,
                                 nonlinear,
                                 dropout_pre,
                                 linear_project,
                                 dropout_post)

    def forward(self, x):
        residual = x
        output = self.net(x)
        output = output + residual
        return output


class ConformerConvolution(nn.Module):
    def __init__(self, d_model, kernel_size=33, dropout_rate=0.1):
        super().__init__()
        assert kernel_size % 2 == 1
        self.pre_layer_norm = nn.LayerNorm(d_model)
        self.pointwise_conv1 = nn.Conv1d(d_model, d_model*4, kernel_size=1, bias=False)
        self.glu = F.glu
        self.padding = (kernel_size - 1) // 2
        self.depthwise_conv = nn.Conv1d(d_model*2, d_model*2,
                                        kernel_size, stride=1,
                                        padding=self.padding,
                                        groups=d_model*2, bias=False)
        self.batch_norm = nn.LayerNorm(d_model*2)
        self.swish = Swish()
        self.pointwise_conv2 = nn.Conv1d(d_model*2, d_model, kernel_size=1, bias=False)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask=None):
        residual = x
        out = self.pre_layer_norm(x)
        out = out.transpose(1, 2)
        if mask is not None:
            out.masked_fill_(mask.ne(1), 0.0)
        out = self.pointwise_conv1(out)
        out = F.glu(out, dim=1)
        out = self.depthwise_conv(out)

        out = out.transpose(1, 2)
        out = self.swish(self.batch_norm(out))
        out = out.transpose(1, 2)

        out = self.dropout(self.pointwise_conv2(out))
        if mask is not None:
            out.masked_fill_(mask.ne(1), 0.0)
        out = out.transpose(1, 2)
        return out + residual


class EncoderMultiHeadAttention(nn.Module):
    def __init__(self, n_head, d_model,
                 residual_dropout=0.1):
        super().__init__()
        assert d_model % n_head == 0
        self.n_head = n_head
        self.d_k = d_model // n_head
        self.d_v = self.d_k

        self.w_qs = nn.Linear(d_model, n_head * self.d_k, bias=False)
        self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * self.d_v, bias=False)

        self.layer_norm_q = nn.LayerNorm(d_model)
        self.layer_norm_k = nn.LayerNorm(d_model)
        self.layer_norm_v = nn.LayerNorm(d_model)

        self.attention = ScaledDotProductAttention(temperature=self.d_k ** 0.5)
        self.fc = nn.Linear(n_head * self.d_v, d_model, bias=False)
        self.dropout = nn.Dropout(residual_dropout)

    def forward(self, q, k, v, mask=None):
        sz_b, len_q = q.size(0), q.size(1)

        residual = q
        q, k, v = self.forward_qkv(q, k, v)

        output, attn = self.attention(q, k, v, mask=mask)

        output = self.forward_output(output, residual, sz_b, len_q)
        return output, attn

    def forward_qkv(self, q, k, v):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
        sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)

        q = self.layer_norm_q(q)
        k = self.layer_norm_k(k)
        v = self.layer_norm_v(v)

        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        return q, k, v

    def forward_output(self, output, residual, sz_b, len_q):
        output = output.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
        fc_out = self.fc(output)
        output = self.dropout(fc_out)
        output = output + residual
        return output


class ScaledDotProductAttention(nn.Module):
    def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(0.0)
        self.INF = float('inf')

    def forward(self, q, k, v, mask=None):
        attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature
        output, attn = self.forward_attention(attn, v, mask)
        return output, attn

    def forward_attention(self, attn, v, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
            mask = mask.eq(0)
            attn = attn.masked_fill(mask, -self.INF)
            attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0)
        else:
            attn = torch.softmax(attn, dim=-1)

        d_attn = self.dropout(attn)
        output = torch.matmul(d_attn, v)

        return output, attn


class RelPosMultiHeadAttention(EncoderMultiHeadAttention):
    def __init__(self, n_head, d_model,
                 residual_dropout=0.1):
        super().__init__(n_head, d_model,
                         residual_dropout)
        d_k = d_model // n_head
        self.scale = 1.0 / (d_k ** 0.5)
        self.linear_pos = nn.Linear(d_model, n_head * d_k, bias=False)
        self.pos_bias_u = nn.Parameter(torch.FloatTensor(n_head, d_k))
        self.pos_bias_v = nn.Parameter(torch.FloatTensor(n_head, d_k))
        torch.nn.init.xavier_uniform_(self.pos_bias_u)
        torch.nn.init.xavier_uniform_(self.pos_bias_v)

    def _rel_shift(self, x):
        N, H, T1, T2 = x.size()
        zero_pad = torch.zeros((N, H, T1, 1), device=x.device, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=-1)

        x_padded = x_padded.view(N, H, T2 + 1, T1)
        x = x_padded[:, :, 1:].view_as(x)
        x = x[:, :, :, : x.size(-1) // 2 + 1]
        return x

    def forward(self, q, k, v, pos_emb, mask=None):
        sz_b, len_q = q.size(0), q.size(1)

        residual = q
        q, k, v = self.forward_qkv(q, k, v)

        q = q.transpose(1, 2)
        n_batch_pos = pos_emb.size(0)
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.n_head, self.d_k)
        p = p.transpose(1, 2)

        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)

        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))

        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
        matrix_bd = self._rel_shift(matrix_bd)

        attn_scores = matrix_ac + matrix_bd
        attn_scores.mul_(self.scale)

        output, attn = self.attention.forward_attention(attn_scores, v, mask=mask)

        output = self.forward_output(output, residual, sz_b, len_q)
        return output, attn
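As an illustrative sketch rather than part of the commit (the configuration below is an assumption, much smaller than any released model): the encoder consumes padded fbank features plus their lengths, and after the two stride-2 convolutions returns roughly one output frame per four input frames, along with subsampled lengths and mask.

import torch
from fireredasr.models.module.conformer_encoder import ConformerEncoder

encoder = ConformerEncoder(idim=80, n_layers=2, n_head=4, d_model=256)
feats = torch.randn(2, 100, 80)      # (batch, frames, fbank_dim)
feat_lens = torch.tensor([100, 60])
out, out_lens, mask = encoder(feats, feat_lens)
print(out.shape)   # torch.Size([2, 25, 256]): ~1/4 of the input frames
print(out_lens)    # roughly feat_lens // 4, e.g. tensor([25, 15])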
fireredasr/fireredasr/models/module/transformer_decoder.py ADDED
@@ -0,0 +1,299 @@
from typing import List, Optional, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor


class TransformerDecoder(nn.Module):
    def __init__(
            self, sos_id, eos_id, pad_id, odim,
            n_layers, n_head, d_model,
            residual_dropout=0.1, pe_maxlen=5000):
        super().__init__()
        self.INF = 1e10
        # parameters
        self.pad_id = pad_id
        self.sos_id = sos_id
        self.eos_id = eos_id
        self.n_layers = n_layers

        # Components
        self.tgt_word_emb = nn.Embedding(odim, d_model, padding_idx=self.pad_id)
        self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)
        self.dropout = nn.Dropout(residual_dropout)

        self.layer_stack = nn.ModuleList()
        for l in range(n_layers):
            block = DecoderLayer(d_model, n_head, residual_dropout)
            self.layer_stack.append(block)

        self.tgt_word_prj = nn.Linear(d_model, odim, bias=False)
        self.layer_norm_out = nn.LayerNorm(d_model)

        self.tgt_word_prj.weight = self.tgt_word_emb.weight
        self.scale = (d_model ** 0.5)

    def batch_beam_search(self, encoder_outputs, src_masks,
                          beam_size=1, nbest=1, decode_max_len=0,
                          softmax_smoothing=1.0, length_penalty=0.0, eos_penalty=1.0):
        B = beam_size
        N, Ti, H = encoder_outputs.size()
        device = encoder_outputs.device
        maxlen = decode_max_len if decode_max_len > 0 else Ti
        assert eos_penalty > 0.0 and eos_penalty <= 1.0

        # Init
        encoder_outputs = encoder_outputs.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, Ti, H)
        src_mask = src_masks.unsqueeze(1).repeat(1, B, 1, 1).view(N*B, -1, Ti)
        ys = torch.ones(N*B, 1).fill_(self.sos_id).long().to(device)
        caches: List[Optional[Tensor]] = []
        for _ in range(self.n_layers):
            caches.append(None)
        scores = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(device)
        scores = scores.repeat(N).view(N*B, 1)
        is_finished = torch.zeros_like(scores)

        # Autoregressive Prediction
        for t in range(maxlen):
            tgt_mask = self.ignored_target_position_is_0(ys, self.pad_id)

            dec_output = self.dropout(
                self.tgt_word_emb(ys) * self.scale +
                self.positional_encoding(ys))

            i = 0
            for dec_layer in self.layer_stack:
                dec_output = dec_layer.forward(
                    dec_output, encoder_outputs,
                    tgt_mask, src_mask,
                    cache=caches[i])
                caches[i] = dec_output
                i += 1

            dec_output = self.layer_norm_out(dec_output)

            t_logit = self.tgt_word_prj(dec_output[:, -1])
            t_scores = F.log_softmax(t_logit / softmax_smoothing, dim=-1)

            if eos_penalty != 1.0:
                t_scores[:, self.eos_id] *= eos_penalty

            t_topB_scores, t_topB_ys = torch.topk(t_scores, k=B, dim=1)
            t_topB_scores = self.set_finished_beam_score_to_zero(t_topB_scores, is_finished)
            t_topB_ys = self.set_finished_beam_y_to_eos(t_topB_ys, is_finished)

            # Accumulated
            scores = scores + t_topB_scores

            # Pruning
            scores = scores.view(N, B*B)
            scores, topB_score_ids = torch.topk(scores, k=B, dim=1)
            scores = scores.view(-1, 1)

            topB_row_number_in_each_B_rows_of_ys = torch.div(topB_score_ids, B).view(N*B)
            stride = B * torch.arange(N).view(N, 1).repeat(1, B).view(N*B).to(device)
            topB_row_number_in_ys = topB_row_number_in_each_B_rows_of_ys.long() + stride.long()

            # Update ys
            ys = ys[topB_row_number_in_ys]
            t_ys = torch.gather(t_topB_ys.view(N, B*B), dim=1, index=topB_score_ids).view(N*B, 1)
            ys = torch.cat((ys, t_ys), dim=1)

            # Update caches
            new_caches: List[Optional[Tensor]] = []
            for cache in caches:
                if cache is not None:
                    new_caches.append(cache[topB_row_number_in_ys])
            caches = new_caches

            # Update finished state
            is_finished = t_ys.eq(self.eos_id)
            if is_finished.sum().item() == N*B:
                break

        # Length penalty (follow GNMT)
        scores = scores.view(N, B)
        ys = ys.view(N, B, -1)
        ys_lengths = self.get_ys_lengths(ys)
        if length_penalty > 0.0:
            penalty = torch.pow((5+ys_lengths.float())/(5.0+1), length_penalty)
            scores /= penalty
        nbest_scores, nbest_ids = torch.topk(scores, k=int(nbest), dim=1)
        nbest_scores = -1.0 * nbest_scores
        index = nbest_ids + B * torch.arange(N).view(N, 1).to(device).long()
        nbest_ys = ys.view(N*B, -1)[index.view(-1)]
        nbest_ys = nbest_ys.view(N, nbest_ids.size(1), -1)
        nbest_ys_lengths = ys_lengths.view(N*B)[index.view(-1)].view(N, -1)

        # result
        nbest_hyps: List[List[Dict[str, Tensor]]] = []
        for n in range(N):
            n_nbest_hyps: List[Dict[str, Tensor]] = []
            for i, score in enumerate(nbest_scores[n]):
                new_hyp = {
                    "yseq": nbest_ys[n, i, 1:nbest_ys_lengths[n, i]]
                }
                n_nbest_hyps.append(new_hyp)
            nbest_hyps.append(n_nbest_hyps)
        return nbest_hyps

    def ignored_target_position_is_0(self, padded_targets, ignore_id):
        mask = torch.ne(padded_targets, ignore_id)
        mask = mask.unsqueeze(dim=1)
        T = padded_targets.size(-1)
        upper_tri_0_mask = self.upper_triangular_is_0(T).unsqueeze(0).to(mask.dtype)
        upper_tri_0_mask = upper_tri_0_mask.to(mask.dtype).to(mask.device)
        return mask.to(torch.uint8) & upper_tri_0_mask.to(torch.uint8)

    def upper_triangular_is_0(self, size):
        ones = torch.ones(size, size)
        tri_left_ones = torch.tril(ones)
        return tri_left_ones.to(torch.uint8)

    def set_finished_beam_score_to_zero(self, scores, is_finished):
        NB, B = scores.size()
        is_finished = is_finished.float()
        mask_score = torch.tensor([0.0] + [-self.INF]*(B-1)).float().to(scores.device)
        mask_score = mask_score.view(1, B).repeat(NB, 1)
        return scores * (1 - is_finished) + mask_score * is_finished

    def set_finished_beam_y_to_eos(self, ys, is_finished):
        is_finished = is_finished.long()
        return ys * (1 - is_finished) + self.eos_id * is_finished

    def get_ys_lengths(self, ys):
        N, B, Tmax = ys.size()
        ys_lengths = torch.sum(torch.ne(ys, self.eos_id), dim=-1)
        return ys_lengths.int()


class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_head, dropout):
        super().__init__()
        self.self_attn_norm = nn.LayerNorm(d_model)
        self.self_attn = DecoderMultiHeadAttention(d_model, n_head, dropout)

        self.cross_attn_norm = nn.LayerNorm(d_model)
        self.cross_attn = DecoderMultiHeadAttention(d_model, n_head, dropout)

        self.mlp_norm = nn.LayerNorm(d_model)
        self.mlp = PositionwiseFeedForward(d_model, d_model*4, dropout)

    def forward(self, dec_input, enc_output, self_attn_mask, cross_attn_mask,
                cache=None):
        x = dec_input
        residual = x
        x = self.self_attn_norm(x)
        if cache is not None:
            xq = x[:, -1:, :]
            residual = residual[:, -1:, :]
            self_attn_mask = self_attn_mask[:, -1:, :]
        else:
            xq = x
        x = self.self_attn(xq, x, x, mask=self_attn_mask)
        x = residual + x

        residual = x
        x = self.cross_attn_norm(x)
        x = self.cross_attn(x, enc_output, enc_output, mask=cross_attn_mask)
        x = residual + x

        residual = x
        x = self.mlp_norm(x)
        x = residual + self.mlp(x)

        if cache is not None:
            x = torch.cat([cache, x], dim=1)

        return x


class DecoderMultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_model // n_head

        self.w_qs = nn.Linear(d_model, n_head * self.d_k)
        self.w_ks = nn.Linear(d_model, n_head * self.d_k, bias=False)
        self.w_vs = nn.Linear(d_model, n_head * self.d_k)

        self.attention = DecoderScaledDotProductAttention(
            temperature=self.d_k ** 0.5)
        self.fc = nn.Linear(n_head * self.d_k, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        q = self.w_qs(q).view(bs, -1, self.n_head, self.d_k)
        k = self.w_ks(k).view(bs, -1, self.n_head, self.d_k)
        v = self.w_vs(v).view(bs, -1, self.n_head, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        if mask is not None:
            mask = mask.unsqueeze(1)

        output = self.attention(q, k, v, mask=mask)

        output = output.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        output = self.fc(output)
        output = self.dropout(output)

        return output


class DecoderScaledDotProductAttention(nn.Module):
    def __init__(self, temperature):
        super().__init__()
        self.temperature = temperature
        self.INF = float("inf")

    def forward(self, q, k, v, mask=None):
        attn = torch.matmul(q, k.transpose(2, 3)) / self.temperature
        if mask is not None:
            mask = mask.eq(0)
            attn = attn.masked_fill(mask, -self.INF)
            attn = torch.softmax(attn, dim=-1).masked_fill(mask, 0.0)
        else:
            attn = torch.softmax(attn, dim=-1)
        output = torch.matmul(attn, v)
        return output


class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.act = nn.GELU()
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        output = self.w_2(self.act(self.w_1(x)))
        output = self.dropout(output)
        return output


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        assert d_model % 2 == 0
        pe = torch.zeros(max_len, d_model, requires_grad=False)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(torch.log(torch.tensor(10000.0)).item()/d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        length = x.size(1)
        return self.pe[:, :length].clone().detach()
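One detail worth spelling out is the GNMT-style length penalty applied after the search loop: scores are divided by ((5 + |Y|) / 6) ** alpha, so longer hypotheses get a larger divisor and are penalized less per token. A small numeric check, standalone and not part of the repo:

import torch

ys_lengths = torch.tensor([10.0, 20.0])
alpha = 0.5  # the `length_penalty` argument above
penalty = torch.pow((5 + ys_lengths) / (5.0 + 1), alpha)
print(penalty)  # tensor([1.5811, 2.0412]): the 20-token hypothesis is divided by more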
fireredasr/fireredasr/speech2text.py ADDED
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

import argparse
import glob
import os
import sys

from fireredasr.models.fireredasr import FireRedAsr


parser = argparse.ArgumentParser()
parser.add_argument('--asr_type', type=str, required=True, choices=["aed", "llm"])
parser.add_argument('--model_dir', type=str, required=True)

# Input / Output
parser.add_argument("--wav_path", type=str)
parser.add_argument("--wav_paths", type=str, nargs="*")
parser.add_argument("--wav_dir", type=str)
parser.add_argument("--wav_scp", type=str)
parser.add_argument("--output", type=str)

# Decode Options
parser.add_argument('--use_gpu', type=int, default=1)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--beam_size", type=int, default=1)
parser.add_argument("--decode_max_len", type=int, default=0)
# FireRedASR-AED
parser.add_argument("--nbest", type=int, default=1)
parser.add_argument("--softmax_smoothing", type=float, default=1.0)
parser.add_argument("--aed_length_penalty", type=float, default=0.0)
parser.add_argument("--eos_penalty", type=float, default=1.0)
# FireRedASR-LLM
parser.add_argument("--decode_min_len", type=int, default=0)
parser.add_argument("--repetition_penalty", type=float, default=1.0)
parser.add_argument("--llm_length_penalty", type=float, default=0.0)
parser.add_argument("--temperature", type=float, default=1.0)


def main(args):
    wavs = get_wav_info(args)
    fout = open(args.output, "w") if args.output else None

    model = FireRedAsr.from_pretrained(args.asr_type, args.model_dir)

    batch_uttid = []
    batch_wav_path = []
    for i, wav in enumerate(wavs):
        uttid, wav_path = wav
        batch_uttid.append(uttid)
        batch_wav_path.append(wav_path)
        if len(batch_wav_path) < args.batch_size and i != len(wavs) - 1:
            continue

        results = model.transcribe(
            batch_uttid,
            batch_wav_path,
            {
                "use_gpu": args.use_gpu,
                "beam_size": args.beam_size,
                "nbest": args.nbest,
                "decode_max_len": args.decode_max_len,
                "softmax_smoothing": args.softmax_smoothing,
                "aed_length_penalty": args.aed_length_penalty,
                "eos_penalty": args.eos_penalty,
                "decode_min_len": args.decode_min_len,
                "repetition_penalty": args.repetition_penalty,
                "llm_length_penalty": args.llm_length_penalty,
                "temperature": args.temperature
            }
        )

        for result in results:
            print(result)
            if fout is not None:
                fout.write(f"{result['uttid']}\t{result['text']}\n")

        batch_uttid = []
        batch_wav_path = []


def get_wav_info(args):
    """
    Returns:
        wavs: list of (uttid, wav_path)
    """
    base = lambda p: os.path.basename(p).replace(".wav", "")
    if args.wav_path:
        wavs = [(base(args.wav_path), args.wav_path)]
    elif args.wav_paths and len(args.wav_paths) >= 1:
        wavs = [(base(p), p) for p in sorted(args.wav_paths)]
    elif args.wav_scp:
        wavs = [line.strip().split() for line in open(args.wav_scp)]
    elif args.wav_dir:
        wavs = glob.glob(f"{args.wav_dir}/**/*.wav", recursive=True)
        wavs = [(base(p), p) for p in sorted(wavs)]
    else:
        raise ValueError("Please provide valid wav info")
    print(f"#wavs={len(wavs)}")
    return wavs


if __name__ == "__main__":
    args = parser.parse_args()
    print(args)
    main(args)
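The same decoding path can be driven from Python instead of the CLI; a hedged sketch that simply mirrors the options dictionary `main()` builds (the paths are placeholders, and the model directory must already contain downloaded weights):

from fireredasr.models.fireredasr import FireRedAsr

model = FireRedAsr.from_pretrained("aed", "pretrained_models/FireRedASR-AED-L")
results = model.transcribe(
    ["IT0011W0001"],                    # uttids
    ["examples/wav/IT0011W0001.wav"],   # wav paths
    {
        "use_gpu": 0, "beam_size": 1, "nbest": 1, "decode_max_len": 0,
        "softmax_smoothing": 1.0, "aed_length_penalty": 0.0, "eos_penalty": 1.0,
        "decode_min_len": 0, "repetition_penalty": 1.0,
        "llm_length_penalty": 0.0, "temperature": 1.0,
    },
)
print(results[0]["text"])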
fireredasr/fireredasr/tokenizer/aed_tokenizer.py ADDED
@@ -0,0 +1,67 @@
import logging
import re

import sentencepiece as spm

from fireredasr.data.token_dict import TokenDict


class ChineseCharEnglishSpmTokenizer:
    """
    - One Chinese char is one token.
    - English words are split by SPM; each piece is one token.
    - Ignores ' ' between Chinese chars.
    - Replaces ' ' between English words with "▁" via spm_model.
    - SPM pieces need to be present in the dict file.
    - If spm_model is not set, falls back to English chars and <space>.
    """
    SPM_SPACE = "▁"

    def __init__(self, dict_path, spm_model, unk="<unk>", space="<space>"):
        self.dict = TokenDict(dict_path, unk=unk)
        self.space = space
        if spm_model:
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(spm_model)
        else:
            self.sp = None
            print("[WARN] spm_model not set, will use English chars")
            print("[WARN] Please check how to deal with ' ' (space)")
            if self.space not in self.dict:
                print("Please add <space> to your dict, or it will be <unk>")

    def tokenize(self, text, replace_punc=True):
        #if text == "":
        #    logging.info(f"empty text")
        text = text.upper()
        tokens = []
        if replace_punc:
            text = re.sub("[,。?!,\.?!]", " ", text)
        pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])')
        parts = pattern.split(text.strip())
        parts = [p for p in parts if len(p.strip()) > 0]
        for part in parts:
            if pattern.fullmatch(part) is not None:
                tokens.append(part)
            else:
                if self.sp:
                    for piece in self.sp.EncodeAsPieces(part.strip()):
                        tokens.append(piece)
                else:
                    for char in part.strip():
                        tokens.append(char if char != " " else self.space)
        tokens_id = []
        for token in tokens:
            tokens_id.append(self.dict.get(token, self.dict.unk))
        return tokens, tokens_id

    def detokenize(self, inputs, join_symbol="", replace_spm_space=True):
        """inputs is ids or tokens, do not need self.sp"""
        if len(inputs) > 0 and type(inputs[0]) == int:
            tokens = [self.dict[id] for id in inputs]
        else:
            tokens = inputs
        s = f"{join_symbol}".join(tokens)
        if replace_spm_space:
            s = s.replace(self.SPM_SPACE, ' ').strip()
        return s
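The core of tokenize() is the CJK split shown above; a quick, standalone illustration of just that regex (not repo code):

import re

pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])')
parts = [p for p in pattern.split("今天WEATHER很GOOD") if len(p.strip()) > 0]
print(parts)  # ['今', '天', 'WEATHER', '很', 'GOOD']
# CJK chars become single tokens; the Latin runs are handed to SPM
# (or split per character when no spm_model is configured).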
fireredasr/fireredasr/tokenizer/llm_tokenizer.py ADDED
@@ -0,0 +1,105 @@
import re

import torch
from transformers import AutoTokenizer
from transformers.trainer_pt_utils import LabelSmoother

DEFAULT_SPEECH_TOKEN = "<speech>"
IGNORE_TOKEN_ID = LabelSmoother.ignore_index


class LlmTokenizerWrapper:
    @classmethod
    def build_llm_tokenizer(cls, llm_path, use_flash_attn=False):
        tokenizer = AutoTokenizer.from_pretrained(llm_path)
        if use_flash_attn:
            tokenizer.padding_side = "left"
        else:
            tokenizer.padding_side = "right"
        special_tokens_dict = {"additional_special_tokens": [DEFAULT_SPEECH_TOKEN]}
        tokenizer.add_special_tokens(special_tokens_dict)
        return tokenizer

    @classmethod
    def clean_text(cls, origin_text):
        """remove punc, remove space between Chinese and keep space between English"""
        # remove punc
        text = re.sub("[,。?!,\.!?《》()\·“”、\\/]", "", origin_text)
        # merge space
        text = re.sub("\s+", " ", text)

        # remove space between Chinese and keep space between English
        pattern = re.compile(r'([\u3400-\u4dbf\u4e00-\u9fff])')  # Chinese
        parts = pattern.split(text.strip())
        parts = [p for p in parts if len(p.strip()) > 0]
        text = "".join(parts)
        text = text.strip()

        text = text.lower()
        return text

    @classmethod
    def preprocess_texts(cls, origin_texts, tokenizer, max_len, decode=False):
        messages = []
        clean_texts = []
        for i, origin_text in enumerate(origin_texts):
            text = cls.clean_text(origin_text)
            clean_texts.append(text)
            text = text if not decode else ""
            message = [
                {"role": "user", "content": f"{DEFAULT_SPEECH_TOKEN}请转写音频为文字"},
                {"role": "assistant", "content": text},
            ]
            messages.append(message)

        texts = []
        if not decode:
            TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{ '<|im_end|>'}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"
        else:
            TEMPLATE = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if loop.last %}{{''}}{% else %}{{ '<|im_end|>\n' }}{% endif %}{% endfor %}"
        for i, msg in enumerate(messages):
            texts.append(
                tokenizer.apply_chat_template(
                    msg,
                    tokenize=True,
                    chat_template=TEMPLATE,
                    add_generation_prompt=False,
                    padding="longest",
                    max_length=max_len,
                    truncation=True,
                )
            )

        # Padding texts
        max_len_texts = max([len(text) for text in texts])
        if tokenizer.padding_side == "right":
            texts = [
                text + [tokenizer.pad_token_id] * (max_len_texts - len(text))
                for text in texts
            ]
        else:
            texts = [
                [tokenizer.pad_token_id] * (max_len_texts - len(text)) + text
                for text in texts
            ]
        input_ids = torch.tensor(texts, dtype=torch.int)

        target_ids = input_ids.clone()
        target_ids[target_ids == tokenizer.pad_token_id] = IGNORE_TOKEN_ID

        # first get the indices of the tokens
        mask_prompt = True
        if mask_prompt:
            mask_indices = torch.where(
                input_ids == tokenizer.convert_tokens_to_ids("assistant")
            )
            for i in range(mask_indices[0].size(0)):
                row = mask_indices[0][i]
                col = mask_indices[1][i]
                target_ids[row, : col + 2] = IGNORE_TOKEN_ID

        attention_mask = input_ids.ne(tokenizer.pad_token_id)

        target_ids = target_ids.type(torch.LongTensor)
        input_ids = input_ids.type(torch.LongTensor)
        return input_ids, attention_mask, target_ids, clean_texts
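For clean_text(), the intended normalization is easiest to see on a mixed example (output traced through the regexes above, assuming the package is importable):

from fireredasr.tokenizer.llm_tokenizer import LlmTokenizerWrapper

print(LlmTokenizerWrapper.clean_text("你好, 世界!Hello World."))
# -> '你好世界hello world': punctuation removed, the space between CJK chars
#    dropped, the inter-word space in English kept, everything lowercased.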
fireredasr/fireredasr/utils/param.py ADDED
@@ -0,0 +1,13 @@
import logging

import torch


def count_model_parameters(model):
    if not isinstance(model, torch.nn.Module):
        return 0, 0
    name = f"{model.__class__.__name__} {model.__class__}"
    num = sum(p.numel() for p in model.parameters() if p.requires_grad)
    size = num * 4.0 / 1024.0 / 1024.0  # float32, MB
    logging.info(f"#param of {name} is {num} = {size:.1f} MB (float32)")
    return num, size
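A quick check of the helper on a plain PyTorch module (the numbers follow from 512*512 weights plus 512 biases; not part of the commit):

import logging
import torch.nn as nn
from fireredasr.utils.param import count_model_parameters

logging.basicConfig(level=logging.INFO)  # logging.info is silent by default
num, size = count_model_parameters(nn.Linear(512, 512))
print(num)           # 262656
print(round(size))   # 1 (MB, counted as float32 like the helper does)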
fireredasr/fireredasr/utils/wer.py ADDED
@@ -0,0 +1,303 @@
#!/usr/bin/env python3

import argparse
import re
from collections import OrderedDict


parser = argparse.ArgumentParser()
parser.add_argument("--ref", type=str, required=True)
parser.add_argument("--hyp", type=str, required=True)
parser.add_argument("--print_sentence_wer", type=int, default=0)
parser.add_argument("--do_tn", type=int, default=0, help="simple tn by cn2an")
parser.add_argument("--rm_special", type=int, default=0, help="remove <\|.*?\|>")


def main(args):
    uttid2refs = read_uttid2tokens(args.ref, args.do_tn, args.rm_special)
    uttid2hyps = read_uttid2tokens(args.hyp, args.do_tn, args.rm_special)
    uttid2wer_info, wer_stat, en_dig_stat = compute_uttid2wer_info(
        uttid2refs, uttid2hyps, args.print_sentence_wer)
    wer_stat.print()
    en_dig_stat.print()


def read_uttid2tokens(filename, do_tn=False, rm_special=False):
    print(f">>> Read uttid to tokens: {filename}", flush=True)
    uttid2tokens = OrderedDict()
    uttid2text = read_uttid2text(filename, do_tn, rm_special)
    for uttid, text in uttid2text.items():
        tokens = text2tokens(text)
        uttid2tokens[uttid] = tokens
    return uttid2tokens


def read_uttid2text(filename, do_tn=False, rm_special=False):
    uttid2text = OrderedDict()
    with open(filename, "r", encoding="utf8") as fin:
        for i, line in enumerate(fin):
            cols = line.split()
            if len(cols) == 0:
                print("[WARN] empty line, continue", i, flush=True)
                continue
            assert cols[0] not in uttid2text, f"repeated uttid: {line}"
            if len(cols) == 1:
                uttid2text[cols[0]] = ""
                continue
            txt = " ".join(cols[1:])
            if rm_special:
                txt = " ".join([t for t in re.split("<\|.*?\|>", txt) if t.strip() != ""])
            if do_tn:
                import cn2an
                txt = cn2an.transform(txt, "an2cn")
            uttid2text[cols[0]] = txt
    return uttid2text


def text2tokens(text):
    PUNCTUATIONS = ",。?!,\.?!"#$%&'()*+-/:;<=>@[\]^_`{|}~⦅⦆「」、 、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·。\":" + "()\[\]{}/;`|=+"
    if text == "":
        return []
    tokens = []

    text = re.sub("<unk>", "", text)
    text = re.sub(r"[%s]+" % PUNCTUATIONS, " ", text)

    pattern = re.compile(r'([\u4e00-\u9fff])')
    parts = pattern.split(text.strip().upper())
    parts = [p for p in parts if len(p.strip()) > 0]
    for part in parts:
        if pattern.fullmatch(part) is not None:
            tokens.append(part)
        else:
            for word in part.strip().split():
                tokens.append(word)
    return tokens


def compute_uttid2wer_info(refs, hyps, print_sentence_wer=False):
    print(f">>> Compute uttid to wer info", flush=True)

    uttid2wer_info = OrderedDict()
    wer_stat = WerStats()
    en_dig_stat = EnDigStats()

    for uttid, ref in refs.items():
        if uttid not in hyps:
            print(f"[WARN] No hyp for {uttid}", flush=True)
            continue
        hyp = hyps[uttid]

        if len(hyp) - len(ref) >= 8:
            print(f"[BigLengthDiff]: {uttid} {len(ref)} {len(hyp)}#{' '.join(ref)}#{' '.join(hyp)}")
            #continue

        wer_info = compute_one_wer_info(ref, hyp)
        uttid2wer_info[uttid] = wer_info
        ns = count_english_digit(ref, hyp, wer_info)
        wer_stat.add(wer_info)
        en_dig_stat.add(*ns)
        if print_sentence_wer:
            print(f"{uttid} {wer_info}")

    return uttid2wer_info, wer_stat, en_dig_stat


COST_SUB = 3
COST_DEL = 3
COST_INS = 3

ALIGN_CRT = 0
ALIGN_SUB = 1
ALIGN_DEL = 2
ALIGN_INS = 3
ALIGN_END = 4


def compute_one_wer_info(ref, hyp):
    """Minimum edit distance with backtrace.
    Args:
        ref, hyp: List[str]
    Returns:
        WerInfo
    """
    ref_len = len(ref)
    hyp_len = len(hyp)

    class _DpPoint:
        def __init__(self, cost, align):
            self.cost = cost
            self.align = align

    dp = []
    for i in range(0, ref_len + 1):
        dp.append([])
        for j in range(0, hyp_len + 1):
            dp[-1].append(_DpPoint(i * j, ALIGN_CRT))

    # Initialize
    for i in range(1, hyp_len + 1):
        dp[0][i].cost = dp[0][i - 1].cost + COST_INS
        dp[0][i].align = ALIGN_INS
    for i in range(1, ref_len + 1):
        dp[i][0].cost = dp[i - 1][0].cost + COST_DEL
        dp[i][0].align = ALIGN_DEL

    # DP
    for i in range(1, ref_len + 1):
        for j in range(1, hyp_len + 1):
            min_cost = 0
            min_align = ALIGN_CRT
            if hyp[j - 1] == ref[i - 1]:
                min_cost = dp[i - 1][j - 1].cost
                min_align = ALIGN_CRT
            else:
                min_cost = dp[i - 1][j - 1].cost + COST_SUB
                min_align = ALIGN_SUB

            del_cost = dp[i - 1][j].cost + COST_DEL
            if del_cost < min_cost:
                min_cost = del_cost
                min_align = ALIGN_DEL

            ins_cost = dp[i][j - 1].cost + COST_INS
            if ins_cost < min_cost:
                min_cost = ins_cost
                min_align = ALIGN_INS

            dp[i][j].cost = min_cost
            dp[i][j].align = min_align

    # Backtrace
    crt = sub = ins = det = 0
    i = ref_len
    j = hyp_len
    align = []
    while i > 0 or j > 0:
        if dp[i][j].align == ALIGN_CRT:
            align.append((i, j, ALIGN_CRT))
            i -= 1
            j -= 1
            crt += 1
        elif dp[i][j].align == ALIGN_SUB:
            align.append((i, j, ALIGN_SUB))
            i -= 1
            j -= 1
            sub += 1
        elif dp[i][j].align == ALIGN_DEL:
            align.append((i, j, ALIGN_DEL))
            i -= 1
            det += 1
        elif dp[i][j].align == ALIGN_INS:
            align.append((i, j, ALIGN_INS))
            j -= 1
            ins += 1

    err = sub + det + ins
    align.reverse()
    wer_info = WerInfo(ref_len, err, crt, sub, det, ins, align)
    return wer_info


class WerInfo:
    def __init__(self, ref, err, crt, sub, dele, ins, ali):
        self.r = ref
        self.e = err
        self.c = crt
        self.s = sub
        self.d = dele
        self.i = ins
        self.ali = ali
        r = max(self.r, 1)
        self.wer = 100.0 * (self.s + self.d + self.i) / r

    def __repr__(self):
        s = f"wer {self.wer:.2f} ref {self.r:2d} sub {self.s:2d} del {self.d:2d} ins {self.i:2d}"
        return s


class WerStats:
    def __init__(self):
        self.infos = []

    def add(self, wer_info):
        self.infos.append(wer_info)

    def print(self):
        r = sum(info.r for info in self.infos)
        if r <= 0:
            print(f"REF len is {r}, check")
            r = 1
        s = sum(info.s for info in self.infos)
        d = sum(info.d for info in self.infos)
        i = sum(info.i for info in self.infos)
        se = 100.0 * s / r
        de = 100.0 * d / r
        ie = 100.0 * i / r
        wer = 100.0 * (s + d + i) / r
        sen = max(len(self.infos), 1)
        errsen = sum(info.e > 0 for info in self.infos)
        ser = 100.0 * errsen / sen
        print("-"*80)
        print(f"ref{r:6d} sub{s:6d} del{d:6d} ins{i:6d}")
        print(f"WER{wer:6.2f} sub{se:6.2f} del{de:6.2f} ins{ie:6.2f}")
        print(f"SER{ser:6.2f} = {errsen} / {sen}")
        print("-"*80)


class EnDigStats:
    def __init__(self):
        self.n_en_word = 0
        self.n_en_correct = 0
        self.n_dig_word = 0
        self.n_dig_correct = 0

    def add(self, n_en_word, n_en_correct, n_dig_word, n_dig_correct):
        self.n_en_word += n_en_word
        self.n_en_correct += n_en_correct
        self.n_dig_word += n_dig_word
        self.n_dig_correct += n_dig_correct

    def print(self):
        print(f"English #word={self.n_en_word}, #correct={self.n_en_correct}\n"
              f"Digit #word={self.n_dig_word}, #correct={self.n_dig_correct}")
        print("-"*80)


def count_english_digit(ref, hyp, wer_info):
    patt_en = "[a-zA-Z\.\-\']+"
    patt_dig = "[0-9]+"
    patt_cjk = re.compile(r'([\u4e00-\u9fff])')
    n_en_word = 0
    n_en_correct = 0
    n_dig_word = 0
    n_dig_correct = 0
    ali = wer_info.ali
    for i, token in enumerate(ref):
        if re.match(patt_en, token):
            n_en_word += 1
            for y in ali:
                if y[0] == i+1 and y[2] == ALIGN_CRT:
                    j = y[1] - 1
                    n_en_correct += 1
                    break
        if re.match(patt_dig, token):
            n_dig_word += 1
            for y in ali:
                if y[0] == i+1 and y[2] == ALIGN_CRT:
                    j = y[1] - 1
                    n_dig_correct += 1
                    break
        if not re.match(patt_cjk, token) and not re.match(patt_en, token) \
                and not re.match(patt_dig, token):
            print("[WeirdChar]:", token)
    return n_en_word, n_en_correct, n_dig_word, n_dig_correct


if __name__ == "__main__":
    args = parser.parse_args()
    print(args, flush=True)
    main(args)
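A minimal sanity check of the aligner (not in the repo; ref/hyp chosen so the expected counts are obvious):

from fireredasr.utils.wer import compute_one_wer_info

info = compute_one_wer_info(["a", "b", "c"], ["a", "x", "c"])
print(info)  # wer 33.33 ref  3 sub  1 del  0 ins  0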
fireredasr/pretrained_models/README.md ADDED
@@ -0,0 +1 @@
Put pretrained models here.
fireredasr/requirements.txt ADDED
@@ -0,0 +1,8 @@
cn2an>=0.5.23
kaldiio>=2.18.0
kaldi_native_fbank>=1.15
numpy>=1.26.1
peft>=0.13.2
sentencepiece
torch>=2.0.0
transformers>=4.46.3