primepake committed
Commit f768eb3 · 1 Parent(s): 7940474

add s3 tokenizer

speech/tools/S3Tokenizer/.flake8 ADDED
@@ -0,0 +1,28 @@
+ [flake8]
+ # Suggested config from pytorch that we can adapt
+ select = B,C,E,F,N,P,T4,W,B9,TOR0,TOR1,TOR2
+ max-line-length = 120
+ # C408 ignored because we like the dict keyword argument syntax
+ # E501 is not flexible enough, we're using B950 instead
+ # N812 ignored because import torch.nn.functional as F is PyTorch convention
+ # N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
+ # E731 allow usage of assigning lambda expressions
+ # N803,N806 allow caps and mixed case in function params. This is to work with Triton kernel coding style.
+ ignore =
+     E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,N803,N806,
+     # shebang has extra meaning in fbcode lints, so I think it's not worth trying
+     # to line this up with executable bit
+     EXE001,
+     # these ignores are from flake8-bugbear; please fix!
+     B007,B008,
+ optional-ascii-coding = True
+ exclude =
+     ./.git,
+     ./docs,
+     ./build,
+     ./scripts,
+     ./venv,
+     *.pyi,
+     .pre-commit-config.yaml,
+     *.md,
+     .flake8
speech/tools/S3Tokenizer/.github/workflows/python-publish.yml ADDED
@@ -0,0 +1,37 @@
+ name: Release
+
+ on:
+   push:
+     branches:
+       - main
+ jobs:
+   deploy:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+       - uses: actions-ecosystem/action-regex-match@v2
+         id: regex-match
+         with:
+           text: ${{ github.event.head_commit.message }}
+           regex: '^Release ([^ ]+)'
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: '3.8'
+       - name: Install dependencies
+         run: |
+           python -m pip install --upgrade pip
+           pip install build twine
+       - name: Release
+         if: ${{ steps.regex-match.outputs.match != '' }}
+         uses: softprops/action-gh-release@v1
+         with:
+           tag_name: v${{ steps.regex-match.outputs.group1 }}
+       - name: Build and publish
+         if: ${{ steps.regex-match.outputs.match != '' }}
+         env:
+           TWINE_USERNAME: __token__
+           TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+         run: |
+           python -m build
+           twine upload dist/*
speech/tools/S3Tokenizer/.github/workflows/unit_test_cpu.yaml ADDED
@@ -0,0 +1,47 @@
+ name: CPU Unit Test
+
+ on:
+   push:
+     branches: [ main ]
+   pull_request:
+
+ concurrency:
+   group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+   cancel-in-progress: true
+
+ jobs:
+   unit-test:
+     runs-on: ${{ matrix.os }}
+     strategy:
+       max-parallel: 20
+       matrix:
+         os: [ubuntu-22.04]
+         python-version: [3.10.16]
+     steps:
+       - name: Cache Python Packages
+         uses: actions/cache@v4
+         with:
+           path: ~/.cache/pip
+           key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/pyproject.toml') }}
+       - name: Setup Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python-version }}
+           architecture: x64
+       - name: Fetch S3Tokenizer
+         uses: actions/checkout@v4
+         with:
+           fetch-depth: 0
+           ref: ${{ github.event.pull_request.head.ref || github.ref }}
+       - name: Install S3Tokenizer Dependencies
+         run: |
+           set -eux
+           sudo apt update && sudo apt install -y ffmpeg libsox-dev libsndfile1
+           pip install -e .
+       - name: Run Pytest
+         run: |
+           set -eux
+           pip install pytest onnxruntime
+           pytest --version
+           PYTHONPATH="${PYTHONPATH:-}:$(pwd)" pytest test/ -q
+           if [ $? != 0 ]; then exit 1; fi
speech/tools/S3Tokenizer/.gitignore ADDED
@@ -0,0 +1,162 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
speech/tools/S3Tokenizer/.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.5.0
+     hooks:
+       - id: trailing-whitespace
+         exclude: 's3tokenizer/assets/.*'
+   - repo: https://github.com/pre-commit/mirrors-yapf
+     rev: 'v0.32.0'
+     hooks:
+       - id: yapf
+   - repo: https://github.com/pycqa/flake8
+     rev: '3.8.2'
+     hooks:
+       - id: flake8
speech/tools/S3Tokenizer/LICENSE ADDED
@@ -0,0 +1,201 @@
+                                  Apache License
+                            Version 2.0, January 2004
+                         http://www.apache.org/licenses/
+
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+    1. Definitions.
+
+       "License" shall mean the terms and conditions for use, reproduction,
+       and distribution as defined by Sections 1 through 9 of this document.
+
+       "Licensor" shall mean the copyright owner or entity authorized by
+       the copyright owner that is granting the License.
+
+       "Legal Entity" shall mean the union of the acting entity and all
+       other entities that control, are controlled by, or are under common
+       control with that entity. For the purposes of this definition,
+       "control" means (i) the power, direct or indirect, to cause the
+       direction or management of such entity, whether by contract or
+       otherwise, or (ii) ownership of fifty percent (50%) or more of the
+       outstanding shares, or (iii) beneficial ownership of such entity.
+
+       "You" (or "Your") shall mean an individual or Legal Entity
+       exercising permissions granted by this License.
+
+       "Source" form shall mean the preferred form for making modifications,
+       including but not limited to software source code, documentation
+       source, and configuration files.
+
+       "Object" form shall mean any form resulting from mechanical
+       transformation or translation of a Source form, including but
+       not limited to compiled object code, generated documentation,
+       and conversions to other media types.
+
+       "Work" shall mean the work of authorship, whether in Source or
+       Object form, made available under the License, as indicated by a
+       copyright notice that is included in or attached to the work
+       (an example is provided in the Appendix below).
+
+       "Derivative Works" shall mean any work, whether in Source or Object
+       form, that is based on (or derived from) the Work and for which the
+       editorial revisions, annotations, elaborations, or other modifications
+       represent, as a whole, an original work of authorship. For the purposes
+       of this License, Derivative Works shall not include works that remain
+       separable from, or merely link (or bind by name) to the interfaces of,
+       the Work and Derivative Works thereof.
+
+       "Contribution" shall mean any work of authorship, including
+       the original version of the Work and any modifications or additions
+       to that Work or Derivative Works thereof, that is intentionally
+       submitted to Licensor for inclusion in the Work by the copyright owner
+       or by an individual or Legal Entity authorized to submit on behalf of
+       the copyright owner. For the purposes of this definition, "submitted"
+       means any form of electronic, verbal, or written communication sent
+       to the Licensor or its representatives, including but not limited to
+       communication on electronic mailing lists, source code control systems,
+       and issue tracking systems that are managed by, or on behalf of, the
+       Licensor for the purpose of discussing and improving the Work, but
+       excluding communication that is conspicuously marked or otherwise
+       designated in writing by the copyright owner as "Not a Contribution."
+
+       "Contributor" shall mean Licensor and any individual or Legal Entity
+       on behalf of whom a Contribution has been received by Licensor and
+       subsequently incorporated within the Work.
+
+    2. Grant of Copyright License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       copyright license to reproduce, prepare Derivative Works of,
+       publicly display, publicly perform, sublicense, and distribute the
+       Work and such Derivative Works in Source or Object form.
+
+    3. Grant of Patent License. Subject to the terms and conditions of
+       this License, each Contributor hereby grants to You a perpetual,
+       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+       (except as stated in this section) patent license to make, have made,
+       use, offer to sell, sell, import, and otherwise transfer the Work,
+       where such license applies only to those patent claims licensable
+       by such Contributor that are necessarily infringed by their
+       Contribution(s) alone or by combination of their Contribution(s)
+       with the Work to which such Contribution(s) was submitted. If You
+       institute patent litigation against any entity (including a
+       cross-claim or counterclaim in a lawsuit) alleging that the Work
+       or a Contribution incorporated within the Work constitutes direct
+       or contributory patent infringement, then any patent licenses
+       granted to You under this License for that Work shall terminate
+       as of the date such litigation is filed.
+
+    4. Redistribution. You may reproduce and distribute copies of the
+       Work or Derivative Works thereof in any medium, with or without
+       modifications, and in Source or Object form, provided that You
+       meet the following conditions:
+
+       (a) You must give any other recipients of the Work or
+           Derivative Works a copy of this License; and
+
+       (b) You must cause any modified files to carry prominent notices
+           stating that You changed the files; and
+
+       (c) You must retain, in the Source form of any Derivative Works
+           that You distribute, all copyright, patent, trademark, and
+           attribution notices from the Source form of the Work,
+           excluding those notices that do not pertain to any part of
+           the Derivative Works; and
+
+       (d) If the Work includes a "NOTICE" text file as part of its
+           distribution, then any Derivative Works that You distribute must
+           include a readable copy of the attribution notices contained
+           within such NOTICE file, excluding those notices that do not
+           pertain to any part of the Derivative Works, in at least one
+           of the following places: within a NOTICE text file distributed
+           as part of the Derivative Works; within the Source form or
+           documentation, if provided along with the Derivative Works; or,
+           within a display generated by the Derivative Works, if and
+           wherever such third-party notices normally appear. The contents
+           of the NOTICE file are for informational purposes only and
+           do not modify the License. You may add Your own attribution
+           notices within Derivative Works that You distribute, alongside
+           or as an addendum to the NOTICE text from the Work, provided
+           that such additional attribution notices cannot be construed
+           as modifying the License.
+
+       You may add Your own copyright statement to Your modifications and
+       may provide additional or different license terms and conditions
+       for use, reproduction, or distribution of Your modifications, or
+       for any such Derivative Works as a whole, provided Your use,
+       reproduction, and distribution of the Work otherwise complies with
+       the conditions stated in this License.
+
+    5. Submission of Contributions. Unless You explicitly state otherwise,
+       any Contribution intentionally submitted for inclusion in the Work
+       by You to the Licensor shall be under the terms and conditions of
+       this License, without any additional terms or conditions.
+       Notwithstanding the above, nothing herein shall supersede or modify
+       the terms of any separate license agreement you may have executed
+       with Licensor regarding such Contributions.
+
+    6. Trademarks. This License does not grant permission to use the trade
+       names, trademarks, service marks, or product names of the Licensor,
+       except as required for reasonable and customary use in describing the
+       origin of the Work and reproducing the content of the NOTICE file.
+
+    7. Disclaimer of Warranty. Unless required by applicable law or
+       agreed to in writing, Licensor provides the Work (and each
+       Contributor provides its Contributions) on an "AS IS" BASIS,
+       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+       implied, including, without limitation, any warranties or conditions
+       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+       PARTICULAR PURPOSE. You are solely responsible for determining the
+       appropriateness of using or redistributing the Work and assume any
+       risks associated with Your exercise of permissions under this License.
+
+    8. Limitation of Liability. In no event and under no legal theory,
+       whether in tort (including negligence), contract, or otherwise,
+       unless required by applicable law (such as deliberate and grossly
+       negligent acts) or agreed to in writing, shall any Contributor be
+       liable to You for damages, including any direct, indirect, special,
+       incidental, or consequential damages of any character arising as a
+       result of this License or out of the use or inability to use the
+       Work (including but not limited to damages for loss of goodwill,
+       work stoppage, computer failure or malfunction, or any and all
+       other commercial damages or losses), even if such Contributor
+       has been advised of the possibility of such damages.
+
+    9. Accepting Warranty or Additional Liability. While redistributing
+       the Work or Derivative Works thereof, You may choose to offer,
+       and charge a fee for, acceptance of support, warranty, indemnity,
+       or other liability obligations and/or rights consistent with this
+       License. However, in accepting such obligations, You may act only
+       on Your own behalf and on Your sole responsibility, not on behalf
+       of any other Contributor, and only if You agree to indemnify,
+       defend, and hold each Contributor harmless for any liability
+       incurred by, or claims asserted against, such Contributor by reason
+       of your accepting any such warranty or additional liability.
+
+    END OF TERMS AND CONDITIONS
+
+    APPENDIX: How to apply the Apache License to your work.
+
+       To apply the Apache License to your work, attach the following
+       boilerplate notice, with the fields enclosed by brackets "[]"
+       replaced with your own identifying information. (Don't include
+       the brackets!) The text should be enclosed in the appropriate
+       comment syntax for the file format. We also recommend that a
+       file or class name and description of purpose be included on the
+       same "printed page" as the copyright notice for easier
+       identification within third-party archives.
+
+    Copyright [yyyy] [name of copyright owner]
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
speech/tools/S3Tokenizer/MANIFEST.in ADDED
@@ -0,0 +1,4 @@
+ include requirements.txt
+ include README.md
+ include LICENSE
+ include s3tokenizer/assets/*
speech/tools/S3Tokenizer/README.md ADDED
@@ -0,0 +1,150 @@
+ # Reverse Engineering of S3Tokenizer
+
+ <div align="center">
+   <img src="https://arxiv.org/html/2407.04051v2/x1.png" alt="Description" width="35%" />
+   <p><em>Supervised Semantic Speech Tokenizer (S3Tokenizer)</em></p>
+ </div>
+
+ S3Tokenizer was initially introduced in CosyVoice [[Paper]](https://arxiv.org/abs/2407.04051v2) [[Repo]](https://github.com/FunAudioLLM/CosyVoice). It is a supervised semantic speech tokenizer based on the pre-trained SenseVoice-Large model, which enhances the semantic relationship of the extracted tokens to textual and paralinguistic information, is robust to data noise, and reduces the reliance on clean data collection, thereby enabling a broader range of data to be used for model training.
+
+ However, as indicated in this [[issue]](https://github.com/FunAudioLLM/CosyVoice/issues/70), the authors have no intention of open-sourcing the PyTorch implementation of the S3Tokenizer and only plan to release an ONNX file. Additionally, users aiming to fine-tune CosyVoice must extract speech codes offline with the batch size restricted to 1, a process that is notably time-consuming (refer to [[cosyvoice/tools/extract_speech_token.py]](https://github.com/FunAudioLLM/CosyVoice/blob/main/tools/extract_speech_token.py)).
+
+ This repository undertakes a reverse engineering of the S3Tokenizer, offering:
+ 1. A pure PyTorch implementation of S3Tokenizer (see [[model.py]](https://github.com/xingchensong/S3Tokenizer/blob/main/s3tokenizer/model.py)), compatible with initializing weights from the released ONNX file (see [[utils.py::onnx2torch()]](https://github.com/xingchensong/S3Tokenizer/blob/main/s3tokenizer/utils.py)).
+ 2. High-throughput (distributed) batch inference, achieving a ~790x speedup compared to the original inference pipeline in [[cosyvoice/tools/extract_speech_token.py]](https://github.com/FunAudioLLM/CosyVoice/blob/main/tools/extract_speech_token.py).
+ 3. The capability to perform online speech code extraction during SpeechLLM training.
+
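+ For a quick check of what the package exposes, `available_models()` (defined in `s3tokenizer/__init__.py`) lists the supported checkpoints, and `load_model()` accepts either one of these names (the ONNX file is downloaded to `~/.cache/s3tokenizer` by default and converted to PyTorch weights via `utils.py::onnx2torch()`) or a path to an already-downloaded ONNX file:
+
+ ```py
+ import s3tokenizer
+
+ print(s3tokenizer.available_models())
+ # ['speech_tokenizer_v1', 'speech_tokenizer_v1_25hz', 'speech_tokenizer_v2_25hz']
+
+ model = s3tokenizer.load_model("speech_tokenizer_v2_25hz")
+ ```
+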
+ ## Latest News 🎉
+ - [2025/07/07] S3Tokenizer now has built-in **long audio processing** capabilities, requiring no additional operations from users!
+
+ ## Supported Models 🔥
+ - [x] Model: [S3Tokenizer V1 50hz](https://modelscope.cn/models/iic/CosyVoice-300M)
+ - [x] Model: [S3Tokenizer V1 25hz](https://modelscope.cn/models/iic/CosyVoice-300M-25Hz)
+ - [x] Model: [S3Tokenizer V2 25hz](https://modelscope.cn/models/iic/CosyVoice2-0.5B)
+
+ # Setup
+
+ ```sh
+ pip install s3tokenizer
+ ```
+
+ # Usage-1: Offline batch inference
+
+ ```py
+ import s3tokenizer
+
+ tokenizer = s3tokenizer.load_model("speech_tokenizer_v1").cuda()  # or "speech_tokenizer_v1_25hz" / "speech_tokenizer_v2_25hz"
+
+ mels = []
+ wav_paths = ["s3tokenizer/assets/BAC009S0764W0121.wav", "s3tokenizer/assets/BAC009S0764W0122.wav"]
+ for wav_path in wav_paths:
+     audio = s3tokenizer.load_audio(wav_path)
+     mels.append(s3tokenizer.log_mel_spectrogram(audio))
+ mels, mels_lens = s3tokenizer.padding(mels)
+ codes, codes_lens = tokenizer.quantize(mels.cuda(), mels_lens.cuda())  # Automatically handles long audio internally!
+
+ for i in range(len(wav_paths)):
+     print(codes[i, :codes_lens[i].item()])
+ ```
+
+ # Usage-2: Distributed offline batch inference via command-line tools
+
+ ## 2.1 CPU batch inference
+
+ ```sh
+ s3tokenizer --wav_scp xxx.scp \
+             --device "cpu" \
+             --output_dir "./" \
+             --batch_size 32 \
+             --model "speech_tokenizer_v1"  # or "speech_tokenizer_v1_25hz" / "speech_tokenizer_v2_25hz"
+ ```
+
+ https://github.com/user-attachments/assets/d37d10fd-0e13-46a3-86b0-4cbec309086f
+
+ ## 2.2 (Multi) GPU batch inference (a.k.a. distributed inference)
+
+ ```sh
+ torchrun --nproc_per_node=8 --nnodes=1 \
+          --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+     `which s3tokenizer` --wav_scp xxx.scp \
+                         --device "cuda" \
+                         --output_dir "./" \
+                         --batch_size 32 \
+                         --model "speech_tokenizer_v1"  # or "speech_tokenizer_v1_25hz" / "speech_tokenizer_v2_25hz"
+ ```
+
+ https://github.com/user-attachments/assets/79a3fb11-7199-4ee2-8a35-9682a3b4d94a
+
+ ## 2.3 Performance Benchmark
+
+ | Method | Time cost on AISHELL test set | Relative speedup | Miss rate |
+ |:------:|:-----------------------------:|:----------------:|:---------:|
+ | [[cosyvoice/tools/extract_speech_token.py]](https://github.com/FunAudioLLM/CosyVoice/blob/main/tools/extract_speech_token.py), cpu | 9 hours | 1x (baseline) | N/A |
+ | cpu, batch size 32 | 1.5 hours | ~6x | 0.00% |
+ | 4 gpus (3090), batch size 32 per gpu | 41 s | ~790x | 0.00% |
+
+ The miss rate is the proportion of tokens that differ between the batch-inference predictions and the ONNX (batch=1) inference predictions.
+
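+ The evaluation script for this metric is not included in the repository; a minimal sketch of the computation, assuming the batch and ONNX decodings of an utterance have equal length, looks like:
+
+ ```py
+ def miss_rate(batch_codes, onnx_codes):
+     """Fraction of positions where batch inference disagrees with ONNX (batch=1)."""
+     assert len(batch_codes) == len(onnx_codes)
+     mismatches = sum(b != o for b, o in zip(batch_codes, onnx_codes))
+     return mismatches / max(len(onnx_codes), 1)
+ ```
+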
+ # Usage-3: Online speech code extraction
+
+ <table>
+ <tr>
+ <th>Before (extract code offline)</th>
+ <th>After (extract code online)</th>
+ </tr>
+ <tr>
+ <td>
+ <sub>
+
+ ```py
+ class SpeechLLM(nn.Module):
+     ...
+     def __init__(self, ...):
+         ...
+
+     def forward(self, speech_codes: Tensor, text_ids: Tensor, ...):
+         ...
+ ```
+
+ </sub>
+ </td>
+ <td>
+ <sub>
+
+ ```py
+ import s3tokenizer
+
+ class SpeechLLM(nn.Module):
+     ...
+     def __init__(self, ...):
+         ...
+         self.speech_tokenizer = s3tokenizer.load_model("speech_tokenizer_v1")  # or "speech_tokenizer_v1_25hz"
+         self.speech_tokenizer.freeze()
+
+     def forward(self, speech: Tensor, speech_lens: Tensor, text_ids: Tensor, ...):
+         ...
+         speech_codes, speech_codes_lens = self.speech_tokenizer.quantize(speech, speech_lens)
+         speech_codes = speech_codes.clone()  # for backward compatibility
+         speech_codes_lens = speech_codes_lens.clone()  # for backward compatibility
+ ```
+
+ </sub>
+ </td>
+ </tr>
+ </table>
+
+ # Usage-4: Long Audio Processing (Built-in Automatic Processing)
+
+ - **Automatic detection**: the model detects audio length automatically (>30 seconds triggers long-audio processing)
+ - **Sliding window**: a 30-second window with 4-second overlap automatically segments long audio
+ - **Batch processing**: segments are batched internally for improved efficiency
+ - **Complete transparency**: the calling code is identical to the short-audio case (see the sketch below)
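+
+ A sketch of the long-audio path (`long_audio.wav` is a placeholder for any recording longer than 30 seconds; the call is the same as in Usage-1):
+
+ ```py
+ import s3tokenizer
+
+ tokenizer = s3tokenizer.load_model("speech_tokenizer_v2_25hz").cuda()
+
+ # No special handling needed: quantize() segments the mel spectrogram with
+ # the 30 s window / 4 s overlap internally and merges the resulting tokens.
+ audio = s3tokenizer.load_audio("long_audio.wav")
+ mels, mels_lens = s3tokenizer.padding([s3tokenizer.log_mel_spectrogram(audio)])
+ codes, codes_lens = tokenizer.quantize(mels.cuda(), mels_lens.cuda())
+ print(codes[0, :codes_lens[0].item()])
+ ```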
speech/tools/S3Tokenizer/requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pre-commit
+ numpy
+ torch
+ onnx
+ tqdm
+ torchaudio
+ einops
speech/tools/S3Tokenizer/s3tokenizer/__init__.py ADDED
@@ -0,0 +1,153 @@
+ # Copyright (c) 2023 OpenAI. (authors: Whisper Team)
+ #               2024 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Modified from
+ https://github.com/openai/whisper/blob/main/whisper/__init__.py
+ """
+
+ import hashlib
+ import os
+ import urllib.request
+ import warnings
+ from typing import List, Union
+
+ from tqdm import tqdm
+
+ from s3tokenizer.model_v2 import S3TokenizerV2
+
+ from .model import S3Tokenizer
+ from .utils import (load_audio, log_mel_spectrogram, make_non_pad_mask,
+                     mask_to_bias, onnx2torch, padding, merge_tokenized_segments)
+
+ __all__ = [
+     'load_audio', 'log_mel_spectrogram', 'make_non_pad_mask', 'mask_to_bias',
+     'onnx2torch', 'padding', 'merge_tokenized_segments'
+ ]
+ _MODELS = {
+     "speech_tokenizer_v1":
+     "https://www.modelscope.cn/models/iic/cosyvoice-300m/"
+     "resolve/master/speech_tokenizer_v1.onnx",
+     "speech_tokenizer_v1_25hz":
+     "https://www.modelscope.cn/models/iic/CosyVoice-300M-25Hz/"
+     "resolve/master/speech_tokenizer_v1.onnx",
+     "speech_tokenizer_v2_25hz":
+     "https://www.modelscope.cn/models/iic/CosyVoice2-0.5B/"
+     "resolve/master/speech_tokenizer_v2.onnx",
+ }
+
+ _SHA256S = {
+     "speech_tokenizer_v1":
+     "23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e",
+     "speech_tokenizer_v1_25hz":
+     "56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486",
+     "speech_tokenizer_v2_25hz":
+     "d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71",
+ }
+
+
+ def _download(name: str, root: str) -> Union[bytes, str]:
+     os.makedirs(root, exist_ok=True)
+
+     expected_sha256 = _SHA256S[name]
+     url = _MODELS[name]
+     download_target = os.path.join(root, f"{name}.onnx")
+
+     if os.path.exists(download_target) and not os.path.isfile(download_target):
+         raise RuntimeError(
+             f"{download_target} exists and is not a regular file")
+
+     if os.path.isfile(download_target):
+         with open(download_target, "rb") as f:
+             model_bytes = f.read()
+         if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+             return download_target
+         else:
+             warnings.warn(
+                 f"{download_target} exists, but the SHA256 checksum does not"
+                 " match; re-downloading the file")
+
+     with urllib.request.urlopen(url) as source, open(download_target,
+                                                      "wb") as output:
+         with tqdm(
+                 total=int(source.info().get("Content-Length")),
+                 ncols=80,
+                 unit="iB",
+                 unit_scale=True,
+                 unit_divisor=1024,
+                 desc="Downloading onnx checkpoint",
+         ) as loop:
+             while True:
+                 buffer = source.read(8192)
+                 if not buffer:
+                     break
+
+                 output.write(buffer)
+                 loop.update(len(buffer))
+
+     model_bytes = open(download_target, "rb").read()
+     if hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
+         raise RuntimeError(
+             "Model has been downloaded but the SHA256 checksum does not"
+             " match. Please retry loading the model.")
+
+     return download_target
+
+
+ def available_models() -> List[str]:
+     """Returns the names of available models"""
+     return list(_MODELS.keys())
+
+
+ def load_model(
+     name: str,
+     download_root: str = None,
+ ) -> S3Tokenizer:
+     """
+     Load a S3Tokenizer ASR model
+
+     Parameters
+     ----------
+     name : str
+         one of the official model names listed by
+         `s3tokenizer.available_models()`, or path to a model checkpoint
+         containing the model dimensions and the model state_dict.
+     download_root: str
+         path to download the model files; by default,
+         it uses "~/.cache/s3tokenizer"
+
+     Returns
+     -------
+     model : S3Tokenizer
+         The S3Tokenizer model instance
+     """
+
+     if download_root is None:
+         default = os.path.join(os.path.expanduser("~"), ".cache")
+         download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default),
+                                      "s3tokenizer")
+
+     if name in _MODELS:
+         checkpoint_file = _download(name, download_root)
+     elif os.path.isfile(name):
+         checkpoint_file = name
+     else:
+         raise RuntimeError(
+             f"Model {name} not found; available models = {available_models()}")
+     if 'v2' in name:
+         model = S3TokenizerV2(name)
+     else:
+         model = S3Tokenizer(name)
+     model.init_from_onnx(checkpoint_file)
+
+     return model
speech/tools/S3Tokenizer/s3tokenizer/assets/mel_filters.npz ADDED
Binary file (4.27 kB).
speech/tools/S3Tokenizer/s3tokenizer/cli.py ADDED
@@ -0,0 +1,212 @@
+ # Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Example Usage
+ cpu:
+
+ s3tokenizer --root_path /path/to/audio/files \
+             --model speech_tokenizer_v2_25hz \
+             --device "cpu" \
+             --batch_size 32
+
+ gpu:
+
+ torchrun --nproc_per_node=1 --nnodes=1 \
+     --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
+     `which s3tokenizer` --root_path /data/dataset \
+                         --model speech_tokenizer_v2_25hz \
+                         --device "cuda" \
+                         --batch_size 64
+
+ """
+
+ import argparse
+ import os
+ from pathlib import Path
+
+ import torch
+ import torch.distributed as dist
+ from torch.utils.data import DataLoader, Dataset, DistributedSampler
+ from tqdm import tqdm
+
+ import s3tokenizer
+
+
+ class AudioDataset(Dataset):
+
+     def __init__(self, root_path, extensions=['.wav', '.flac', '.mp3']):
+         self.data = []
+
+         # Recursively find all audio files
+         root = Path(root_path)
+         for ext in extensions:
+             self.data.extend(root.rglob(f'*{ext}'))
+
+         # Sort for consistent ordering
+         self.data.sort()
+
+         if len(self.data) == 0:
+             raise ValueError(f"No audio files found in {root_path}")
+
+         print(f"Found {len(self.data)} audio files")
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         file_path = self.data[idx]
+         audio = s3tokenizer.load_audio(str(file_path))
+         mel = s3tokenizer.log_mel_spectrogram(audio)
+         return file_path, mel
+
+
+ def collate_fn(batch):
+     file_paths = [item[0] for item in batch]
+     mels = [item[1] for item in batch]
+     mels, mels_lens = s3tokenizer.padding(mels)
+     return file_paths, mels, mels_lens
+
+
+ def init_distributed():
+     world_size = int(os.environ.get('WORLD_SIZE', 1))
+     local_rank = int(os.environ.get('LOCAL_RANK', 0))
+     rank = int(os.environ.get('RANK', 0))
+     print('Inference on multiple gpus, this gpu {}'.format(local_rank) +
+           ', rank {}, world_size {}'.format(rank, world_size))
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group("nccl")
+     return world_size, local_rank, rank
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(description='extract speech code')
+     parser.add_argument('--model',
+                         required=True,
+                         type=str,
+                         choices=[
+                             "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
+                             "speech_tokenizer_v2_25hz"
+                         ],
+                         help='model version')
+     parser.add_argument('--root_path',
+                         required=True,
+                         type=str,
+                         help='root directory containing audio files')
+     parser.add_argument('--device',
+                         required=True,
+                         type=str,
+                         choices=["cuda", "cpu"],
+                         help='device for inference')
+     parser.add_argument('--batch_size',
+                         required=True,
+                         type=int,
+                         help='batch size (per-device) for inference')
+     parser.add_argument('--num_workers',
+                         type=int,
+                         default=4,
+                         help='workers for dataloader')
+     parser.add_argument('--prefetch',
+                         type=int,
+                         default=5,
+                         help='prefetch for dataloader')
+     parser.add_argument('--extensions',
+                         nargs='+',
+                         default=['.wav', '.flac', '.mp3'],
+                         help='audio file extensions to process')
+     args = parser.parse_args()
+     return args
+
+
+ def save_tokens(file_path, codes, codes_len):
+     """Save tokens as .pt file with _fsq suffix"""
+     # Remove extension and add _fsq.pt
+     output_path = file_path.with_suffix('').with_suffix('.pt')
+     output_path = output_path.parent / f"{output_path.stem}_fsq.pt"
+
+     # Extract only valid codes (up to codes_len)
+     valid_codes = codes[:codes_len]
+     # convert valid codes to list
+     valid_codes = valid_codes.tolist()
+
+     # Save as tensor
+     torch.save(valid_codes, output_path)
+
+     return output_path
+
+
+ def main():
+     args = get_args()
+
+     if args.device == "cuda":
+         assert (torch.cuda.is_available())
+         world_size, local_rank, rank = init_distributed()
+     else:
+         world_size, local_rank, rank = 1, 0, 0
+
+     device = torch.device(args.device)
+     model = s3tokenizer.load_model(args.model).to(device)
+     dataset = AudioDataset(args.root_path, args.extensions)
+
+     if args.device == "cuda":
+         model = torch.nn.parallel.DistributedDataParallel(
+             model, device_ids=[local_rank])
+         sampler = DistributedSampler(dataset,
+                                      num_replicas=world_size,
+                                      rank=rank)
+     else:
+         sampler = None
+
+     dataloader = DataLoader(dataset,
+                             batch_size=args.batch_size,
+                             sampler=sampler,
+                             shuffle=False,
+                             num_workers=args.num_workers,
+                             prefetch_factor=args.prefetch,
+                             collate_fn=collate_fn)
+
+     total_steps = len(dataset)
+
+     if rank == 0:
+         progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")
+
+     processed_count = 0
+     for file_paths, mels, mels_lens in dataloader:
+         codes, codes_lens = model(mels.to(device), mels_lens.to(device))
+
+         # Process each file in the batch
+         for i, file_path in enumerate(file_paths):
+             code = codes[i]
+             code_len = codes_lens[i].item()
+
+             # Save tokens as .pt file
+             output_path = save_tokens(file_path, code, code_len)
+
+             if rank == 0:
+                 tqdm.write(f"Saved: {file_path} -> {output_path}")
+
+         processed_count += len(file_paths)
+
+         if rank == 0:
+             progress_bar.update(world_size * len(file_paths))
+
+     if rank == 0:
+         progress_bar.close()
+         print(f"\nProcessed {processed_count} files on rank {rank}")
+
+     if args.device == "cuda":
+         dist.barrier()
+         dist.destroy_process_group()
+
+
+ if __name__ == "__main__":
+     main()
speech/tools/S3Tokenizer/s3tokenizer/model.py ADDED
@@ -0,0 +1,546 @@
+ # Copyright (c) 2023 OpenAI. (authors: Whisper Team)
+ #               2024 Tsinghua Univ. (authors: Xingchen Song)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Modified from https://github.com/openai/whisper/blob/main/whisper/model.py
+    Add EuclideanCodebook & VectorQuantization
+ """
+
+ from dataclasses import dataclass
+ from typing import Iterable, Optional, Tuple
+
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ from einops import rearrange
+ from torch import Tensor, nn
+
+ from .utils import make_non_pad_mask, mask_to_bias, onnx2torch, merge_tokenized_segments
+
+
+ @dataclass
+ class ModelConfig:
+     n_mels: int = 128
+     n_audio_ctx: int = 1500
+     n_audio_state: int = 1280
+     n_audio_head: int = 20
+     n_audio_layer: int = 6
+     n_codebook_size: int = 4096
+
+     use_sdpa: bool = False
+
+
+ class LayerNorm(nn.LayerNorm):
+
+     def forward(self, x: Tensor) -> Tensor:
+         return super().forward(x.float()).type(x.dtype)
+
+
+ class Linear(nn.Linear):
+
+     def forward(self, x: Tensor) -> Tensor:
+         return F.linear(
+             x,
+             self.weight.to(x.dtype),
+             None if self.bias is None else self.bias.to(x.dtype),
+         )
+
+
+ class Conv1d(nn.Conv1d):
+
+     def _conv_forward(self, x: Tensor, weight: Tensor,
+                       bias: Optional[Tensor]) -> Tensor:
+         return super()._conv_forward(
+             x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype))
+
+
+ def sinusoids(length, channels, max_timescale=10000):
+     """Returns sinusoids for positional embedding"""
+     assert channels % 2 == 0
+     log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+     inv_timescales = torch.exp(-log_timescale_increment *
+                                torch.arange(channels // 2))
+     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[
+         np.newaxis, :]
+     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+ class MultiHeadAttention(nn.Module):
+
+     def __init__(self, n_state: int, n_head: int, use_sdpa: bool = False):
+         super().__init__()
+         self.n_head = n_head
+         self.query = Linear(n_state, n_state)
+         self.key = Linear(n_state, n_state, bias=False)
+         self.value = Linear(n_state, n_state)
+         self.out = Linear(n_state, n_state)
+
+         self.use_sdpa = use_sdpa
+
+     def forward(
+         self,
+         x: Tensor,
+         mask: Optional[Tensor] = None,
+     ):
+         q = self.query(x)
+         k = self.key(x)
+         v = self.value(x)
+
+         wv, qk = self.qkv_attention(q, k, v, mask)
+         return self.out(wv), qk
+
+     def qkv_attention(self,
+                       q: Tensor,
+                       k: Tensor,
+                       v: Tensor,
+                       mask: Optional[Tensor] = None):
+         _, _, D = q.shape
+         scale = (D // self.n_head)**-0.25
+         q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+         k = k.view(*k.shape[:2], self.n_head, -1)
+         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+         if not self.use_sdpa:
+             k = k.permute(0, 2, 3, 1) * scale
+             qk = q @ k  # (B, n_head, T, T)
+             if mask is not None:
+                 qk = qk + mask
+             qk = qk.float()
+             w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
+             return (w @ v).permute(0, 2, 1,
+                                    3).flatten(start_dim=2), qk.detach()
+         else:
+             k = k.permute(0, 2, 1, 3) * scale
+             assert mask is not None
+             output = torch.nn.functional.scaled_dot_product_attention(
+                 q,
+                 k,
+                 v,
+                 attn_mask=mask,
+                 dropout_p=0.,
+                 scale=1.,
+             )
+             output = (output.transpose(1,
+                                        2).contiguous().view(q.size(0), -1, D)
+                       )  # (batch, time1, d_model)
+             return output, None
+
+
+ class ResidualAttentionBlock(nn.Module):
+
+     def __init__(self, n_state: int, n_head: int, use_sdpa: bool):
+         super().__init__()
+
+         self.attn = MultiHeadAttention(n_state, n_head, use_sdpa=use_sdpa)
+         self.attn_ln = LayerNorm(n_state)
+
+         n_mlp = n_state * 4
+         self.mlp = nn.Sequential(Linear(n_state, n_mlp), nn.GELU(),
+                                  Linear(n_mlp, n_state))
+         self.mlp_ln = LayerNorm(n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         mask: Optional[Tensor] = None,
+     ):
+         x = x + self.attn(self.attn_ln(x), mask=mask)[0]
+         x = x + self.mlp(self.mlp_ln(x))
+         return x
+
+
+ class AudioEncoder(nn.Module):
+
+     def __init__(
+         self,
+         n_mels: int,
+         n_ctx: int,
+         n_state: int,
+         n_head: int,
+         n_layer: int,
+         stride: int,
+         use_sdpa: bool,
+     ):
+         super().__init__()
+         self.stride = stride
+         self.conv1 = Conv1d(n_mels,
+                             n_state,
+                             kernel_size=3,
+                             stride=stride,
+                             padding=1)
+         self.conv2 = Conv1d(n_state,
+                             n_state,
+                             kernel_size=3,
+                             stride=2,
+                             padding=1)
+         self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList([
+             ResidualAttentionBlock(n_state, n_head, use_sdpa=use_sdpa)
+             for _ in range(n_layer)
+         ])
+
+     def forward(self, x: Tensor, x_len: Tensor) -> Tuple[Tensor, Tensor]:
+         """
+         x : torch.Tensor, shape = (batch_size, n_mels, T)
+             the mel spectrogram of the audio
+         x_len: torch.Tensor, shape = (batch_size,)
+             length of each audio in x
+         """
+         mask = make_non_pad_mask(x_len).unsqueeze(1)
+         x = F.gelu(self.conv1(x * mask))
+         x_len = (x_len + 2 - 1 * (3 - 1) - 1) // self.stride + 1
+         mask = make_non_pad_mask(x_len).unsqueeze(1)
+         x = F.gelu(self.conv2(x * mask))
+         x_len = (x_len + 2 - 1 * (3 - 1) - 1) // 2 + 1
+         mask = make_non_pad_mask(x_len).unsqueeze(1)
+         x = x.permute(0, 2, 1)  # (B, T // 2, n_state)
+
+         mask = mask_to_bias(mask, x.dtype)
+
+         x = (x + self.positional_embedding[:x.shape[1], :]).to(x.dtype)
+
+         for block in self.blocks:
+             x = block(x, mask.unsqueeze(1))
+
+         return x, x_len
+
+
+ class EuclideanCodebook(nn.Module):
+     """Codebook with Euclidean distance (inference-only).
+     Args:
+         dim (int): Dimension.
+         codebook_size (int): Codebook size.
+     """
+
+     def __init__(self, dim: int, codebook_size: int):
+         super().__init__()
+         embed = torch.zeros(codebook_size, dim)
+         self.codebook_size = codebook_size
+         self.register_buffer("embed", embed)
+
+     @torch.inference_mode()
+     def preprocess(self, x: Tensor) -> Tensor:
+         x = rearrange(x, "... d -> (...) d")
+         return x
+
+     @torch.inference_mode()
+     def quantize(self, x: Tensor) -> Tensor:
+         embed = self.embed.t().to(x.dtype)
+         dist = -(x.pow(2).sum(1, keepdim=True) - 2 * x @ embed +
+                  embed.pow(2).sum(0, keepdim=True))
+         embed_ind = dist.max(dim=-1).indices
+         return embed_ind
+
+     @torch.inference_mode()
+     def postprocess_emb(self, embed_ind, shape):
+         return embed_ind.view(*shape[:-1])
+
+     @torch.inference_mode()
+     def dequantize(self, embed_ind: Tensor) -> Tensor:
+         quantize = F.embedding(embed_ind, self.embed)
+         return quantize
+
+     @torch.inference_mode()
+     def encode(self, x: Tensor) -> Tensor:
+         shape = x.shape
+         # pre-process
+         x = self.preprocess(x)
+         # quantize
+         embed_ind = self.quantize(x)
+         # post-process
+         embed_ind = self.postprocess_emb(embed_ind, shape)
+         return embed_ind
+
+     @torch.inference_mode()
+     def decode(self, embed_ind: Tensor) -> Tensor:
+         quantize = self.dequantize(embed_ind)
+         return quantize
+
+
+ class VectorQuantization(nn.Module):
+     """Vector quantization implementation (inference-only).
+     Args:
+         dim (int): Dimension
+         codebook_size (int): Codebook size
+     """
+
+     def __init__(self, dim: int, codebook_size: int):
+         super().__init__()
+         self._codebook = EuclideanCodebook(dim=dim,
+                                            codebook_size=codebook_size)
+         self.codebook_size = codebook_size
+
+     @property
+     def codebook(self):
+         return self._codebook.embed
+
+     @torch.inference_mode()
+     def encode(self, x: Tensor) -> Tensor:
+         x = F.normalize(x.float(), p=2, dim=-1)
+         embed_in = self._codebook.encode(x)
+         return embed_in
+
+     @torch.inference_mode()
+     def decode(self, embed_ind: Tensor) -> Tensor:
+         quantize = self._codebook.decode(embed_ind)
+         quantize = rearrange(quantize, "b n d -> b d n")
+         return quantize
+
+
+ class S3Tokenizer(nn.Module):
+     """S3 tokenizer implementation (inference-only).
+     Args:
+         config (ModelConfig): Config
+     """
+
+     def __init__(self, name: str, config: ModelConfig = ModelConfig()):
+         super().__init__()
+         self.name = name  # Store model name for token_rate determination
+         self.config = config
+         self.encoder = AudioEncoder(
+             self.config.n_mels,
+             self.config.n_audio_ctx,
+             self.config.n_audio_state,
+             self.config.n_audio_head,
+             self.config.n_audio_layer,
+             2 if name == "speech_tokenizer_v1_25hz" else 1,
+             self.config.use_sdpa,
+         )
+         self.quantizer = VectorQuantization(self.config.n_audio_state,
+                                             self.config.n_codebook_size)
+
+     def forward(self, mel: Tensor, mel_len: Tensor) -> Tuple[Tensor, Tensor]:
+         return self.quantize(mel, mel_len)
+
+     @torch.inference_mode()
+     def quantize(self, mel: Tensor, mel_len: Tensor) -> Tuple[Tensor, Tensor]:
+         """
+         Quantize mel spectrogram to tokens, with automatic long audio handling.
+
+         Args:
+             mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
+             mel_len: mel length tensor, shape (batch_size,)
+
+         Returns:
+             code: quantized tokens, shape (batch_size, T')
+             code_len: token length, shape (batch_size,)
+         """
+         # Check if any audio in the batch exceeds 30 seconds
+         # Assuming 16kHz sample rate and hop_length=160, 30s = 30*16000/160 = 3000 frames
+         max_frames = 3000
+
+         # Check which samples are long audio
+         long_audio_mask = mel_len > max_frames
+
+         if long_audio_mask.any():
+             # Has long audio - need special processing
+             return self._quantize_mixed_batch(mel, mel_len, long_audio_mask,
+                                               max_frames)
+         else:
+             # All short audio - use original method
+             hidden, code_len = self.encoder(mel, mel_len)
+             code = self.quantizer.encode(hidden)
+             return code, code_len
+
+     @torch.inference_mode()
+     def _quantize_mixed_batch(self, mel: Tensor, mel_len: Tensor,
+                               long_audio_mask: Tensor,
+                               max_frames: int) -> Tuple[Tensor, Tensor]:
+         """
+         Handle mixed batch with both short and long audio using unified batch processing.
+
+         Args:
+             mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
+             mel_len: mel length tensor, shape (batch_size,)
+             long_audio_mask: boolean mask for long audio, shape (batch_size,)
+             max_frames: maximum frames for short audio
+
+         Returns:
+             code: quantized tokens, shape (batch_size, T')
+             code_len: token length, shape (batch_size,)
+         """
+         batch_size = mel.size(0)
+
+         # Parameters for sliding window
+         sample_rate = 16000
+         hop_length = 160  # Default hop length for mel spectrogram
+         window_size = 30  # seconds
+         overlap = 4  # seconds
+
+         # Calculate frame-based parameters
+         frames_per_window = window_size * sample_rate // hop_length  # 3000 frames
+         frames_per_overlap = overlap * sample_rate // hop_length  # 400 frames
+         frames_per_stride = frames_per_window - frames_per_overlap  # 2600 frames
+
+         # Collect all segments to process (including short and long audio segments)
+         all_segments = []
+         all_segments_len = []
+         segment_info = [
+         ]  # Record which audio each segment belongs to and whether it's long audio
+
+         # Process all audio in the batch
+         for batch_idx in range(batch_size):
+             audio_mel = mel[batch_idx]
+             audio_mel_len = mel_len[batch_idx]
+             is_long_audio = long_audio_mask[batch_idx].item()
+
+             if not is_long_audio:
+                 # Short audio: process directly as a single segment
+                 segment = audio_mel[:, :audio_mel_len]
+                 seg_len = audio_mel_len.item()
+
+                 # Pad to max_frames if necessary
+                 if seg_len < frames_per_window:
+                     pad_size = frames_per_window - seg_len
+                     segment = F.pad(segment, (0, pad_size))
+
+                 all_segments.append(segment)
+                 all_segments_len.append(
+                     torch.tensor(seg_len, device=mel.device))
+                 segment_info.append({
+                     'batch_idx': batch_idx,
+                     'is_long_audio': False,
+                     'segment_idx': 0,
+                     'total_segments': 1
+                 })
+             else:
+                 # Long audio: split into multiple segments
+                 start = 0
+                 segment_idx = 0
+                 while start < audio_mel_len:
+                     end = min(start + frames_per_window, audio_mel_len)
+                     segment = audio_mel[:, start:end]
+
+                     seg_len = segment.size(1)
+                     # Pad if necessary
+                     if seg_len < frames_per_window:
+                         pad_size = frames_per_window - seg_len
+                         segment = F.pad(segment, (0, pad_size))
+
+                     all_segments.append(segment)
+                     all_segments_len.append(
+                         torch.tensor(seg_len, device=mel.device))
+                     segment_info.append({
+                         'batch_idx': batch_idx,
+                         'is_long_audio': True,
+                         'segment_idx': segment_idx,
+                         'total_segments': None  # Will be filled later
+                     })
+
+                     segment_idx += 1
+                     start += frames_per_stride
+
+                 # Update total_segments info
+                 total_segments = segment_idx
+                 for info in segment_info:
+                     if info['batch_idx'] == batch_idx and info['is_long_audio']:
+                         info['total_segments'] = total_segments
+
+         if not all_segments:
+             # Fallback if no segments
+             return torch.zeros(batch_size,
+                                0,
+                                dtype=torch.long,
+                                device=mel.device), torch.zeros(
+                                    batch_size,
+                                    dtype=torch.long,
+                                    device=mel.device)
+
+         # Unified batch processing for all segments
+         unified_batch_mel = torch.stack(all_segments)
+         unified_batch_lens = torch.stack(all_segments_len)
+
+         # Process all segments at once
+         hidden, code_len = self.encoder(unified_batch_mel, unified_batch_lens)
+         codes = self.quantizer.encode(hidden)
+
+         # Reorganize results based on segment_info
+         results = {}  # batch_idx -> (code_tensor, code_len)
+
+         for seg_idx, info in enumerate(segment_info):
+             batch_idx = info['batch_idx']
+             is_long_audio = info['is_long_audio']
+             segment_idx = info['segment_idx']
+
+             # Get codes for current segment
+             segment_code = codes[
+                 seg_idx, :code_len[seg_idx].item()].cpu().numpy().tolist()
+
+             if not is_long_audio:
+                 # Short audio: use directly
+                 code_tensor = torch.tensor(segment_code,
+                                            dtype=torch.long,
+                                            device=mel.device)
+                 results[batch_idx] = (code_tensor, len(segment_code))
+             else:
+                 # Long audio: collect all segments
+                 if batch_idx not in results:
+                     results[batch_idx] = []
+                 results[batch_idx].append(segment_code)
+
+         # Process long audio segment merging
+         for batch_idx in range(batch_size):
+             if long_audio_mask[batch_idx].item():
+                 # Merge long audio segments
+                 audio_codes = results[batch_idx]
+
+                 # Determine token rate based on model name
+                 if hasattr(self,
+                            'name') and self.name == "speech_tokenizer_v1":
+                     token_rate = 50
+                 else:
+                     token_rate = 25
+
+                 merged_codes = merge_tokenized_segments(audio_codes,
+                                                         overlap=overlap,
+                                                         token_rate=token_rate)
+
+                 # Convert to tensor
+                 merged_codes_tensor = torch.tensor(merged_codes,
+                                                    dtype=torch.long,
+                                                    device=mel.device)
+                 results[batch_idx] = (merged_codes_tensor, len(merged_codes))
+
+         # Construct final output
+         max_code_len = max(code_info[1] for code_info in results.values())
+
+         output_codes = torch.zeros(batch_size,
+                                    max_code_len,
+                                    dtype=torch.long,
+                                    device=mel.device)
+         output_codes_len = torch.zeros(batch_size,
+                                        dtype=torch.long,
+                                        device=mel.device)
+
+         for batch_idx, (code_tensor, code_len) in results.items():
+             output_codes[batch_idx, :code_len] = code_tensor
+             output_codes_len[batch_idx] = code_len
+
+         return output_codes, output_codes_len
+
+     @property
+     def device(self):
+         return next(self.parameters()).device
+
+     def init_from_onnx(self, onnx_path: str):
+         ckpt = onnx2torch(onnx_path, None, False)
+         self.load_state_dict(ckpt, strict=True)
+
+     def init_from_pt(self, ckpt_path: str):
+         ckpt = torch.load(ckpt_path, map_location="cpu", mmap=True)
+         self.load_state_dict(ckpt, strict=True)
+
+     def freeze(self):
+         for _, param in self.named_parameters():
+             param.requires_grad = False
speech/tools/S3Tokenizer/s3tokenizer/model_v2.py ADDED
@@ -0,0 +1,604 @@
1
+ # Copyright (c) (Mddct: Dinghao Zhou)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+ from einops import rearrange
20
+
21
+ from s3tokenizer.model import Conv1d, LayerNorm, Linear, MultiHeadAttention
22
+ from s3tokenizer.utils import make_non_pad_mask, mask_to_bias, onnx2torch, merge_tokenized_segments
23
+
24
+
25
+ @dataclass
26
+ class ModelConfig:
27
+ n_mels: int = 128
28
+ n_audio_ctx: int = 1500
29
+ n_audio_state: int = 1280
30
+ n_audio_head: int = 20
31
+ n_audio_layer: int = 6
32
+ n_codebook_size: int = 3**8
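+ # 3**8 = 6561 codes: the FSQ codebook spans 8 projected dimensions with 3 levels each.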
33
+
34
+ use_sdpa: bool = False
35
+
36
+
37
+ def precompute_freqs_cis(dim: int,
38
+ end: int,
39
+ theta: float = 10000.0,
40
+ scaling=None):
41
+ freqs = 1.0 / (theta**(torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
42
+ t = torch.arange(end, device=freqs.device) # type: ignore
43
+ if scaling is not None:
44
+ t = t * scaling
45
+ freqs = torch.outer(t, freqs).float() # type: ignore
46
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
47
+
48
+ return torch.cat((freqs_cis, freqs_cis), dim=-1)
49
+
50
+
51
+ def apply_rotary_emb(
52
+ xq: torch.Tensor,
53
+ xk: torch.Tensor,
54
+ freqs_cis: torch.Tensor,
55
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
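+ # Rotary position embedding: rotate(x) = x * cos + rotate_half(x) * sin, where
+ # rotate_half negates the second half of the last dim and swaps it with the first half.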
56
+ real = torch.view_as_real(freqs_cis)
57
+ cos, sin = real[:, :, 0], real[:, :, 1]
58
+ cos = cos.unsqueeze(0).unsqueeze(2)
59
+ sin = sin.unsqueeze(0).unsqueeze(2)
60
+
61
+ D = xq.shape[-1]
62
+ half_l, half_r = xq[:, :, :, :D // 2], xq[:, :, :, D // 2:]
63
+ xq_r = torch.cat((-half_r, half_l), dim=-1)
64
+
65
+ D = xk.shape[-1]
66
+
67
+ half_l, half_r = xk[:, :, :, :D // 2], xk[:, :, :, D // 2:]
68
+ xk_r = torch.cat((-half_r, half_l), dim=-1)
69
+
70
+ return xq * cos + xq_r * sin, xk * cos + xk_r * sin
71
+
72
+
73
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
74
+ ndim = x.ndim
75
+ assert 0 <= 1 < ndim
76
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1])
77
+ shape = [
78
+ d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)
79
+ ]
80
+ return freqs_cis.view(*shape)
81
+
82
+
83
+ class FSQCodebook(torch.nn.Module):
84
+
85
+ def __init__(self, dim: int, level: int = 3):
86
+ super().__init__()
87
+ self.project_down = torch.nn.Linear(dim, 8)
88
+ self.level = level
89
+ self.embed = None
90
+
91
+ @torch.inference_mode()
92
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
93
+ x = rearrange(x, "... d -> (...) d")
94
+ return x
95
+
96
+ @torch.inference_mode()
97
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
98
+ x_shape = x.shape
99
+ # pre-process
100
+ x = self.preprocess(x)
101
+ # quantize
102
+ h = self.project_down(x).float()
103
+ h = h.tanh()
104
+ h = h * 0.9990000128746033
105
+ h = h.round() + 1
106
+ # h = ((self.level - 1) * h).round() # range [-k, k]
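+ # After tanh, the 0.999 scale, round, and +1, each of the 8 projected dims lies in
+ # {0, 1, 2}; the base-3 weighted sum below packs them into a single index in [0, 3**8).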
107
+ powers = torch.pow(
108
+ self.level,
109
+ torch.arange(2**self.level, device=x.device, dtype=h.dtype))
110
+ mu = torch.sum(h * powers.unsqueeze(0), dim=-1)
111
+ ind = mu.reshape(x_shape[0], x_shape[1]).int()
112
+ return ind
113
+
114
+ @torch.inference_mode()
115
+ def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
116
+ raise NotImplementedError(
117
+ 'There is no official up-projection component provided')
118
+
119
+
120
+ class FSQVectorQuantization(torch.nn.Module):
121
+ """Vector quantization implementation (inference-only).
122
+ Args:
123
+ dim (int): Dimension
124
+ codebook_size (int): Codebook size
125
+ """
126
+
127
+ def __init__(
128
+ self,
129
+ dim: int,
130
+ codebook_size: int,
131
+ ):
132
+ super().__init__()
133
+ assert 3**8 == codebook_size
134
+ self._codebook = FSQCodebook(dim=dim, level=3)
135
+ self.codebook_size = codebook_size
136
+
137
+ @property
138
+ def codebook(self):
139
+ return self._codebook.embed
140
+
141
+ @torch.inference_mode()
142
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
143
+ return self._codebook.encode(x)
144
+
145
+ @torch.inference_mode()
146
+ def decode(self, embed_ind: torch.Tensor) -> torch.Tensor:
147
+ quantize = self._codebook.decode(embed_ind)
148
+ quantize = rearrange(quantize, "b n d -> b d n")
149
+ return quantize
150
+
151
+
152
+ class FSMNMultiHeadAttention(MultiHeadAttention):
153
+
154
+ def __init__(
155
+ self,
156
+ n_state: int,
157
+ n_head: int,
158
+ kernel_size: int = 31,
159
+ use_sdpa: bool = False,
160
+ ):
161
+ super().__init__(n_state, n_head)
162
+
163
+ self.fsmn_block = torch.nn.Conv1d(n_state,
164
+ n_state,
165
+ kernel_size,
166
+ stride=1,
167
+ padding=0,
168
+ groups=n_state,
169
+ bias=False)
170
+ self.left_padding = (kernel_size - 1) // 2
171
+ self.right_padding = kernel_size - 1 - self.left_padding
172
+ self.pad_fn = torch.nn.ConstantPad1d(
173
+ (self.left_padding, self.right_padding), 0.0)
174
+
175
+ self.use_sdpa = use_sdpa
176
+
177
+ def forward_fsmn(self,
178
+ inputs: torch.Tensor,
179
+ mask: Optional[torch.Tensor] = None):
180
+ b, t, _, _ = inputs.size()
181
+ inputs = inputs.view(b, t, -1)
182
+ if mask is not None and mask.size(2) > 0: # time2 > 0
183
+ inputs = inputs * mask
184
+ x = inputs.transpose(1, 2)
185
+ x = self.pad_fn(x)
186
+ x = self.fsmn_block(x)
187
+ x = x.transpose(1, 2)
188
+ x += inputs
189
+ return x * mask
190
+
191
+ def qkv_attention(self,
192
+ q: torch.Tensor,
193
+ k: torch.Tensor,
194
+ v: torch.Tensor,
195
+ mask: Optional[torch.Tensor] = None,
196
+ mask_pad: Optional[torch.Tensor] = None,
197
+ freqs_cis: Optional[torch.Tensor] = None):
198
+ _, _, D = q.shape
199
+ scale = (D // self.n_head)**-0.25
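+ # q and k are each scaled by d_head**-0.25, so q @ k.T carries the usual 1/sqrt(d_head) factor.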
200
+ q = q.view(*q.shape[:2], self.n_head, -1)
201
+ k = k.view(*k.shape[:2], self.n_head, -1)
202
+ v = v.view(*v.shape[:2], self.n_head, -1)
203
+
204
+ if freqs_cis is not None:
205
+ q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis)
206
+
207
+ fsm_memory = self.forward_fsmn(v, mask_pad)
208
+
209
+ q = q.permute(0, 2, 1, 3) * scale
210
+ v = v.permute(0, 2, 1, 3)
211
+
212
+ if not self.use_sdpa:
213
+ k = k.permute(0, 2, 3, 1) * scale
214
+ qk = q @ k # (B, n_head, T, T)
215
+ if mask is not None:
216
+ qk = qk + mask
217
+ qk = qk.float()
218
+ w = torch.nn.functional.softmax(qk, dim=-1).to(q.dtype)
219
+ return (w @ v).permute(
220
+ 0, 2, 1, 3).flatten(start_dim=2), qk.detach(), fsm_memory
221
+ else:
222
+ k = k.permute(0, 2, 1, 3) * scale
223
+ assert mask is not None
224
+ output = torch.nn.functional.scaled_dot_product_attention(
225
+ q,
226
+ k,
227
+ v,
228
+ attn_mask=mask,
229
+ dropout_p=0.,
230
+ scale=1.,
231
+ )
232
+ output = (output.transpose(1,
233
+ 2).contiguous().view(q.size(0), -1, D)
234
+ ) # (batch, time1, d_model)
235
+ return output, None, fsm_memory
236
+
237
+ def forward(self,
238
+ x: torch.Tensor,
239
+ mask: Optional[torch.Tensor] = None,
240
+ mask_pad: Optional[torch.Tensor] = None,
241
+ freqs_cis: Optional[torch.Tensor] = None):
242
+
243
+ q = self.query(x)
244
+ k = self.key(x)
245
+ v = self.value(x)
246
+
247
+ wv, qk, fsm_memory = self.qkv_attention(q, k, v, mask, mask_pad,
248
+ freqs_cis)
249
+ return self.out(wv) + fsm_memory, qk
250
+
251
+
252
+ class ResidualAttentionBlock(torch.nn.Module):
253
+
254
+ def __init__(
255
+ self,
256
+ n_state: int,
257
+ n_head: int,
258
+ kernel_size: int = 31,
259
+ use_sdpa: bool = False,
260
+ ):
261
+ super().__init__()
262
+
263
+ self.attn = FSMNMultiHeadAttention(n_state,
264
+ n_head,
265
+ kernel_size,
266
+ use_sdpa=use_sdpa)
267
+ self.attn_ln = LayerNorm(n_state, eps=1e-6)
268
+
269
+ n_mlp = n_state * 4
270
+
271
+ self.mlp = torch.nn.Sequential(Linear(n_state, n_mlp), torch.nn.GELU(),
272
+ Linear(n_mlp, n_state))
273
+ self.mlp_ln = LayerNorm(n_state)
274
+
275
+ def forward(
276
+ self,
277
+ x: torch.Tensor,
278
+ mask: Optional[torch.Tensor] = None,
279
+ mask_pad: Optional[torch.Tensor] = None,
280
+ freqs_cis: Optional[torch.Tensor] = None,
281
+ ):
282
+ x = x + self.attn(
283
+ self.attn_ln(x), mask=mask, mask_pad=mask_pad,
284
+ freqs_cis=freqs_cis)[0]
285
+
286
+ x = x + self.mlp(self.mlp_ln(x))
287
+ return x
288
+
289
+
290
+ class AudioEncoderV2(torch.nn.Module):
291
+
292
+ def __init__(
293
+ self,
294
+ n_mels: int,
295
+ n_state: int,
296
+ n_head: int,
297
+ n_layer: int,
298
+ stride: int,
299
+ use_sdpa: bool,
300
+ ):
301
+ super().__init__()
302
+ self.stride = stride
303
+
304
+ self.conv1 = Conv1d(n_mels,
305
+ n_state,
306
+ kernel_size=3,
307
+ stride=stride,
308
+ padding=1)
309
+ self.conv2 = Conv1d(n_state,
310
+ n_state,
311
+ kernel_size=3,
312
+ stride=2,
313
+ padding=1)
314
+ self.freqs_cis = precompute_freqs_cis(64, 1024 * 2)
315
+ self.blocks = torch.nn.ModuleList([
316
+ ResidualAttentionBlock(n_state, n_head, use_sdpa=use_sdpa)
317
+ for _ in range(n_layer)
318
+ ])
319
+
320
+ def forward(self, x: torch.Tensor,
321
+ x_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
322
+ """
323
+ x : torch.Tensor, shape = (batch_size, n_mels, T)
324
+ the mel spectrogram of the audio
325
+ x_len: torch.Tensor, shape = (batch_size,)
326
+ length of each audio in x
327
+ """
328
+ mask = make_non_pad_mask(x_len).unsqueeze(1)
329
+ x = torch.nn.functional.gelu(self.conv1(x * mask))
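+ # Conv1d output length: (L_in + 2*padding - kernel_size) // stride + 1, with kernel_size=3, padding=1.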
330
+ x_len = (x_len + 2 - 1 * (3 - 1) - 1) // self.stride + 1
331
+ mask = make_non_pad_mask(x_len).unsqueeze(1)
332
+ x = torch.nn.functional.gelu(self.conv2(x * mask))
333
+ x_len = (x_len + 2 - 1 * (3 - 1) - 1) // 2 + 1
334
+ mask = make_non_pad_mask(x_len).unsqueeze(1)
335
+ x = x.permute(0, 2, 1) # (B, T // 2, n_state)
336
+ freqs_cis = self.freqs_cis.to(x.device)
337
+ mask_pad = mask.transpose(1, 2)
338
+ mask = mask_to_bias(mask, x.dtype)
339
+
340
+ tmp = torch.view_as_real(freqs_cis)
341
+ cos, sin = tmp[:, :, 0], tmp[:, :, 1]
342
+
343
+ cos = torch.cat((cos, cos), dim=-1)
344
+ sin = torch.cat((sin, sin), dim=-1)
345
+ cos = cos.unsqueeze(0).unsqueeze(2)
346
+ sin = sin.unsqueeze(0).unsqueeze(2)
347
+
348
+ for block in self.blocks:
349
+ x = block(x, mask.unsqueeze(1), mask_pad, freqs_cis[:x.size(1)])
350
+
351
+ return x, x_len
352
+
353
+
354
+ class S3TokenizerV2(torch.nn.Module):
355
+ """S3 tokenizer v2 implementation (inference-only).
356
+ Args:
357
+ config (ModelConfig): Config
358
+ """
359
+
360
+ def __init__(self, name: str, config: ModelConfig = ModelConfig()):
361
+ super().__init__()
362
+ self.name = name # Store model name for token_rate determination
363
+ if 'v1' not in name:
364
+ assert 'v2' in name
365
+ # TODO(Mddct): make it configurable
366
+ config.n_codebook_size = 3**8
367
+ self.config = config
368
+ self.encoder = AudioEncoderV2(
369
+ self.config.n_mels,
370
+ self.config.n_audio_state,
371
+ self.config.n_audio_head,
372
+ self.config.n_audio_layer,
373
+ 2,
374
+ self.config.use_sdpa,
375
+ )
376
+ self.quantizer = FSQVectorQuantization(
377
+ self.config.n_audio_state,
378
+ self.config.n_codebook_size,
379
+ )
380
+
381
+ def forward(self, mel: torch.Tensor,
382
+ mel_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
383
+ return self.quantize(mel, mel_len)
384
+
385
+ @torch.inference_mode()
386
+ def quantize(self, mel: torch.Tensor,
387
+ mel_len: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
388
+ """
389
+ Quantize mel spectrogram to tokens, with automatic long audio handling.
390
+
391
+ Args:
392
+ mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
393
+ mel_len: mel length tensor, shape (batch_size,)
394
+
395
+ Returns:
396
+ code: quantized tokens, shape (batch_size, T')
397
+ code_len: token length, shape (batch_size,)
398
+ """
399
+ # Check if any audio in the batch exceeds 30 seconds
400
+ # Assuming 16kHz sample rate and hop_length=160, 30s = 30*16000/160 = 3000 frames
401
+ max_frames = 3000
402
+
403
+ # Check which samples are long audio
404
+ long_audio_mask = mel_len > max_frames
405
+
406
+ if long_audio_mask.any():
407
+ # Has long audio - need special processing
408
+ return self._quantize_mixed_batch(mel, mel_len, long_audio_mask,
409
+ max_frames)
410
+ else:
411
+ # All short audio - use original method
412
+ hidden, code_len = self.encoder(mel, mel_len)
413
+ code = self.quantizer.encode(hidden)
414
+ return code, code_len
415
+
416
+ @torch.inference_mode()
417
+ def _quantize_mixed_batch(
418
+ self, mel: torch.Tensor, mel_len: torch.Tensor,
419
+ long_audio_mask: torch.Tensor,
420
+ max_frames: int) -> Tuple[torch.Tensor, torch.Tensor]:
421
+ """
422
+ Handle mixed batch with both short and long audio using unified batch processing.
423
+
424
+ Args:
425
+ mel: mel spectrogram tensor, shape (batch_size, n_mels, T)
426
+ mel_len: mel length tensor, shape (batch_size,)
427
+ long_audio_mask: boolean mask for long audio, shape (batch_size,)
428
+ max_frames: maximum frames for short audio
429
+
430
+ Returns:
431
+ code: quantized tokens, shape (batch_size, T')
432
+ code_len: token length, shape (batch_size,)
433
+ """
434
+ batch_size = mel.size(0)
435
+
436
+ # Parameters for sliding window
437
+ sample_rate = 16000
438
+ hop_length = 160 # Default hop length for mel spectrogram
439
+ window_size = 30 # seconds
440
+ overlap = 4 # seconds
441
+
442
+ # Calculate frame-based parameters
443
+ frames_per_window = window_size * sample_rate // hop_length # 3000 frames
444
+ frames_per_overlap = overlap * sample_rate // hop_length # 400 frames
445
+ frames_per_stride = frames_per_window - frames_per_overlap # 2600 frames
446
+
447
+ # Collect all segments to process (including short and long audio segments)
448
+ all_segments = []
449
+ all_segments_len = []
450
+ segment_info = [
451
+ ] # Record which audio each segment belongs to and whether it's long audio
452
+
453
+ # Process all audio in the batch
454
+ for batch_idx in range(batch_size):
455
+ audio_mel = mel[batch_idx]
456
+ audio_mel_len = mel_len[batch_idx]
457
+ is_long_audio = long_audio_mask[batch_idx].item()
458
+
459
+ if not is_long_audio:
460
+ # Short audio: process directly as a single segment
461
+ segment = audio_mel[:, :audio_mel_len]
462
+ seg_len = audio_mel_len.item()
463
+
464
+ # Pad to max_frames if necessary
465
+ if seg_len < frames_per_window:
466
+ pad_size = frames_per_window - seg_len
467
+ segment = torch.nn.functional.pad(segment, (0, pad_size))
468
+
469
+ all_segments.append(segment)
470
+ all_segments_len.append(
471
+ torch.tensor(seg_len, device=mel.device))
472
+ segment_info.append({
473
+ 'batch_idx': batch_idx,
474
+ 'is_long_audio': False,
475
+ 'segment_idx': 0,
476
+ 'total_segments': 1
477
+ })
478
+ else:
479
+ # Long audio: split into multiple segments
480
+ start = 0
481
+ segment_idx = 0
482
+ while start < audio_mel_len:
483
+ end = min(start + frames_per_window, audio_mel_len)
484
+ segment = audio_mel[:, start:end]
485
+
486
+ seg_len = segment.size(1)
487
+ # Pad if necessary
488
+ if seg_len < frames_per_window:
489
+ pad_size = frames_per_window - seg_len
490
+ segment = torch.nn.functional.pad(
491
+ segment, (0, pad_size))
492
+
493
+ all_segments.append(segment)
494
+ all_segments_len.append(
495
+ torch.tensor(seg_len, device=mel.device))
496
+ segment_info.append({
497
+ 'batch_idx': batch_idx,
498
+ 'is_long_audio': True,
499
+ 'segment_idx': segment_idx,
500
+ 'total_segments': None # Will be filled later
501
+ })
502
+
503
+ segment_idx += 1
504
+ start += frames_per_stride
505
+
506
+ # Update total_segments info
507
+ total_segments = segment_idx
508
+ for info in segment_info:
509
+ if info['batch_idx'] == batch_idx and info['is_long_audio']:
510
+ info['total_segments'] = total_segments
511
+
512
+ if not all_segments:
513
+ # Fallback if no segments
514
+ return torch.zeros(batch_size,
515
+ 0,
516
+ dtype=torch.long,
517
+ device=mel.device), torch.zeros(
518
+ batch_size,
519
+ dtype=torch.long,
520
+ device=mel.device)
521
+
522
+ # Unified batch processing for all segments
523
+ unified_batch_mel = torch.stack(all_segments)
524
+ unified_batch_lens = torch.stack(all_segments_len)
525
+
526
+ # Process all segments at once
527
+ hidden, code_len = self.encoder(unified_batch_mel, unified_batch_lens)
528
+ codes = self.quantizer.encode(hidden)
529
+
530
+ # Reorganize results based on segment_info
531
+ results = {} # batch_idx -> (code_tensor, code_len)
532
+
533
+ for seg_idx, info in enumerate(segment_info):
534
+ batch_idx = info['batch_idx']
535
+ is_long_audio = info['is_long_audio']
536
+ segment_idx = info['segment_idx']
537
+
538
+ # Get codes for current segment
539
+ segment_code = codes[
540
+ seg_idx, :code_len[seg_idx].item()].cpu().numpy().tolist()
541
+
542
+ if not is_long_audio:
543
+ # Short audio: use directly
544
+ code_tensor = torch.tensor(segment_code,
545
+ dtype=torch.long,
546
+ device=mel.device)
547
+ results[batch_idx] = (code_tensor, len(segment_code))
548
+ else:
549
+ # Long audio: collect all segments
550
+ if batch_idx not in results:
551
+ results[batch_idx] = []
552
+ results[batch_idx].append(segment_code)
553
+
554
+ # Process long audio segment merging
555
+ for batch_idx in range(batch_size):
556
+ if long_audio_mask[batch_idx].item():
557
+ # Merge long audio segments
558
+ audio_codes = results[batch_idx]
559
+
560
+ # V2 models use 25Hz token rate
561
+ token_rate = 25
562
+
563
+ merged_codes = merge_tokenized_segments(audio_codes,
564
+ overlap=overlap,
565
+ token_rate=token_rate)
566
+
567
+ # Convert to tensor
568
+ merged_codes_tensor = torch.tensor(merged_codes,
569
+ dtype=torch.long,
570
+ device=mel.device)
571
+ results[batch_idx] = (merged_codes_tensor, len(merged_codes))
572
+
573
+ # Construct final output
574
+ max_code_len = max(code_info[1] for code_info in results.values())
575
+
576
+ output_codes = torch.zeros(batch_size,
577
+ max_code_len,
578
+ dtype=torch.long,
579
+ device=mel.device)
580
+ output_codes_len = torch.zeros(batch_size,
581
+ dtype=torch.long,
582
+ device=mel.device)
583
+
584
+ for batch_idx, (code_tensor, code_len) in results.items():
585
+ output_codes[batch_idx, :code_len] = code_tensor
586
+ output_codes_len[batch_idx] = code_len
587
+
588
+ return output_codes, output_codes_len
589
+
590
+ @property
591
+ def device(self):
592
+ return next(self.parameters()).device
593
+
594
+ def init_from_onnx(self, onnx_path: str):
595
+ ckpt = onnx2torch(onnx_path, None, False)
596
+ self.load_state_dict(ckpt, strict=True)
597
+
598
+ def init_from_pt(self, ckpt_path: str):
599
+ ckpt = torch.load(ckpt_path, map_location="cpu", mmap=True)
600
+ self.load_state_dict(ckpt, strict=True)
601
+
602
+ def freeze(self):
603
+ for _, param in self.named_parameters():
604
+ param.requires_grad = False
speech/tools/S3Tokenizer/s3tokenizer/utils.py ADDED
@@ -0,0 +1,390 @@
1
+ # Copyright (c) 2023 OpenAI. (authors: Whisper Team)
2
+ # 2024 Tsinghua Univ. (authors: Xingchen Song)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Modified from https://github.com/openai/whisper/blob/main/whisper/audio.py
16
+ Add _rename_weights() & onnx2torch() & make_non_pad_mask() & mask_to_bias()
17
+ Copy merge_tokenized_segments() from https://github.com/Mddct/s3tokenizer-long/blob/main/example.py
18
+ """
19
+
20
+ import os
21
+ from functools import lru_cache
22
+ from typing import List, Optional, Union
23
+
24
+ import numpy as np
25
+ import onnx
26
+ import torch
27
+ import torch.nn.functional as F
28
+ import torchaudio
29
+ from torch.nn.utils.rnn import pad_sequence
30
+
31
+
32
+ def _rename_weights(weights_dict: dict):
33
+ """
34
+ Rename onnx weights to pytorch format.
35
+
36
+ Parameters
37
+ ----------
38
+ weights_dict: dict
39
+ The dict containing weights in onnx format
40
+
41
+ Returns
42
+ -------
43
+ A new weight dict containing the weights in pytorch format.
44
+ """
45
+ new_weight_dict = {}
46
+ for k in weights_dict.keys():
47
+ if "quantizer" in k: # vq or fsq
48
+ if k == "/quantizer/rq/model/layers.0/_codebook/Pow_1":
49
+ new_weight_dict["quantizer._codebook.embed"] = weights_dict[k]
50
+ elif 'project_down' in k: # v2
51
+ new_weight_dict[k] = weights_dict[k]
52
+ elif "positional_embedding" in k: # positional emb
53
+ new_weight_dict[k] = weights_dict[k]
54
+ elif "conv" in k: # 1/2 or 1/4 subsample
55
+ new_weight_dict[k] = weights_dict[k]
56
+ else: # transformer blocks
57
+ assert "blocks" in k
58
+ new_k = (k[1:].replace('/', '.').replace(
59
+ 'MatMul', 'weight').replace('Add_1', 'bias').replace(
60
+ 'Mul', 'weight').replace('Add', 'bias').replace(
61
+ 'mlp.mlp', 'mlp')).replace('fsmn_block.Conv',
62
+ 'fsmn_block.weight')
63
+
64
+ new_weight_dict[f"encoder.{new_k}"] = weights_dict[k]
65
+ return new_weight_dict
66
+
67
+
68
+ def onnx2torch(onnx_path: str, torch_path: str = None, verbose: bool = False):
69
+ """
70
+ Open an onnx file and convert to pytorch format.
71
+
72
+ Parameters
73
+ ----------
74
+ onnx_path: str
75
+ The onnx file to open, typically `speech_tokenizer_v1.onnx`
76
+
77
+ torch_path: str
78
+ The path to save the torch-formatted checkpoint.
79
+
80
+ verbose: bool
81
+ Logging info or not.
82
+
83
+ Returns
84
+ -------
85
+ A checkpoint dict containing the weights and their names, if torch_path is
86
+ None. Otherwise, the checkpoint dict is saved to the given path.
87
+ """
88
+ onnx_model = onnx.load(onnx_path)
89
+ weights_dict = {}
90
+ initializer_map = {
91
+ initializer.name: initializer
92
+ for initializer in onnx_model.graph.initializer
93
+ }
94
+ for node in onnx_model.graph.node:
95
+ for input_name in node.input:
96
+ if input_name in initializer_map:
97
+ ln_bias_name, ln_weight_name = None, None # for v2 ln
98
+ initializer = initializer_map[input_name]
99
+ if input_name in [
100
+ "onnx::Conv_1519",
101
+ "encoders.conv1.weight",
102
+ "onnx::Conv_2216",
103
+ ]: # v1_50hz, v1_25hz, v2_25hz
104
+ weight_name = "encoder.conv1.weight"
105
+ elif input_name in [
106
+ "onnx::Conv_1520",
107
+ "encoders.conv1.bias",
108
+ "onnx::Conv_2217",
109
+ ]: # v1_50hz, v1_25hz, v2_25hz
110
+ weight_name = "encoder.conv1.bias"
111
+ elif input_name in [
112
+ "onnx::Conv_1521",
113
+ "encoders.conv2.weight",
114
+ "onnx::Conv_2218",
115
+ ]:
116
+ weight_name = "encoder.conv2.weight"
117
+ elif input_name in [
118
+ "onnx::Conv_1522",
119
+ "encoders.conv2.bias",
120
+ "onnx::Conv_2219",
121
+ ]:
122
+ weight_name = "encoder.conv2.bias"
123
+ elif input_name == "encoders.positional_embedding":
124
+ weight_name = "encoder.positional_embedding"
125
+ elif input_name == 'quantizer.project_in.bias':
126
+ weight_name = "quantizer._codebook.project_down.bias"
127
+ elif input_name == 'onnx::MatMul_2536':
128
+ weight_name = "quantizer._codebook.project_down.weight"
129
+ else:
130
+ if node.op_type == 'LayerNormalization': # in input_name:
131
+ ln_name = node.name.replace('/LayerNormalization', '')
132
+ ln_weight_name = ln_name + '.weight'
133
+ ln_bias_name = ln_name + '.bias'
134
+ else:
135
+ weight_name = node.name
136
+ if ln_weight_name is not None and ln_bias_name is not None:
137
+ ln_inputs = node.input
138
+ scale_name = ln_inputs[1]
139
+ bias_name = ln_inputs[2]
140
+ scale = onnx.numpy_helper.to_array(
141
+ initializer_map[scale_name]).copy(
142
+ ) if scale_name in initializer_map else None
143
+ bias = onnx.numpy_helper.to_array(
144
+ initializer_map[bias_name]).copy(
145
+ ) if bias_name in initializer_map else None
146
+ scale.flags.writeable = True
147
+ bias.flags.writeable = True
148
+ weight_tensor = torch.from_numpy(scale)
149
+ bias_tensor = torch.from_numpy(bias)
150
+
151
+ weights_dict[ln_bias_name] = bias_tensor
152
+ weights_dict[ln_weight_name] = weight_tensor
153
+ else:
154
+ weight_array = onnx.numpy_helper.to_array(
155
+ initializer).copy()
156
+ weight_array.flags.writeable = True
157
+ weight_tensor = torch.from_numpy(weight_array)
158
+ if len(weight_tensor.shape) > 2 or weight_name in [
159
+ "encoder.positional_embedding"
160
+ ]:
161
+ weights_dict[weight_name] = weight_tensor
162
+ else:
163
+ weights_dict[weight_name] = weight_tensor.t()
164
+
165
+ new_weights_dict = _rename_weights(weights_dict)
166
+ if verbose:
167
+ for k, v in new_weights_dict.items():
168
+ print(f"{k} : {v.shape} {v.dtype}")
169
+ print(f"PyTorch weights saved to {torch_path}")
170
+ del weights_dict, onnx_model
171
+ if torch_path:
172
+ torch.save(new_weights_dict, torch_path)
173
+ else:
174
+ return new_weights_dict
175
+
176
+
177
+ def load_audio(file: str, sr: int = 16000):
178
+ """
179
+ Open an audio file and read as mono waveform, resampling as necessary
180
+
181
+ Parameters
182
+ ----------
183
+ file: str
184
+ The audio file to open
185
+
186
+ sr: int
187
+ The sample rate to resample the audio if necessary
188
+
189
+ Returns
190
+ -------
191
+ A torch.Tensor containing the audio waveform, in float32 dtype.
192
+ """
193
+ audio, sample_rate = torchaudio.load(file)
194
+ if sample_rate != sr:
195
+ audio = torchaudio.transforms.Resample(sample_rate, sr)(audio)
196
+ audio = audio[0] # get the first channel
197
+ return audio
198
+
199
+
200
+ @lru_cache(maxsize=None)
201
+ def _mel_filters(device, n_mels: int) -> torch.Tensor:
202
+ """
203
+ Load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
204
+ This allows decoupling the librosa dependency; the filters were saved using:
205
+
206
+ np.savez_compressed(
207
+ "mel_filters.npz",
208
+ mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
209
+ mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128),
210
+ )
211
+ """
212
+ assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
213
+
214
+ filters_path = os.path.join(os.path.dirname(__file__), "assets",
215
+ "mel_filters.npz")
216
+ with np.load(filters_path, allow_pickle=False) as f:
217
+ return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
218
+
219
+
220
+ def log_mel_spectrogram(
221
+ audio: Union[str, np.ndarray, torch.Tensor],
222
+ n_mels: int = 128,
223
+ padding: int = 0,
224
+ device: Optional[Union[str, torch.device]] = None,
225
+ ):
226
+ """
227
+ Compute the log-Mel spectrogram of the input audio.
228
+
229
+ Parameters
230
+ ----------
231
+ audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
232
+ The path to audio or either a NumPy array or Tensor containing the
233
+ audio waveform in 16 kHz
234
+
235
+ n_mels: int
236
+ The number of Mel-frequency filters; only 80 and 128 are supported
237
+
238
+ padding: int
239
+ Number of zero samples to pad to the right
240
+
241
+ device: Optional[Union[str, torch.device]]
242
+ If given, the audio tensor is moved to this device before STFT
243
+
244
+ Returns
245
+ -------
246
+ torch.Tensor, shape = (n_mels, n_frames)
247
+ A Tensor that contains the Mel spectrogram
248
+ """
249
+ if not torch.is_tensor(audio):
250
+ if isinstance(audio, str):
251
+ audio = load_audio(audio)
252
+
253
+ if device is not None:
254
+ audio = audio.to(device)
255
+ if padding > 0:
256
+ audio = F.pad(audio, (0, padding))
257
+ window = torch.hann_window(400).to(audio.device)
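+ # n_fft=400 (25 ms window) and hop_length=160 (10 ms) at 16 kHz give ~100 mel frames per second.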
258
+ stft = torch.stft(audio, 400, 160, window=window, return_complex=True)
259
+ magnitudes = stft[..., :-1].abs()**2
260
+
261
+ filters = _mel_filters(audio.device, n_mels)
262
+ mel_spec = filters @ magnitudes
263
+
264
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
265
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
266
+ log_spec = (log_spec + 4.0) / 4.0
267
+ return log_spec
268
+
269
+
270
+ def make_non_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
271
+ """Make mask tensor containing indices of non-padded part.
272
+
273
+ The sequences in a batch may have different lengths. To enable
274
+ batch computing, padding is needed to make all sequences the same
275
+ size. To keep the padded part from passing values into context-dependent
276
+ blocks such as attention or convolution, the padding part is
277
+ masked.
278
+
279
+ 1 for non-padded part and 0 for padded part.
280
+
281
+ Parameters
282
+ ----------
283
+ lengths (torch.Tensor): Batch of lengths (B,).
284
+
285
+ Returns:
286
+ -------
287
+ torch.Tensor: Boolean mask, 1 for non-padded and 0 for padded positions (B, max_T).
288
+
289
+ Examples:
290
+ >>> import torch
291
+ >>> import s3tokenizer
292
+ >>> lengths = torch.tensor([5, 3, 2])
293
+ >>> masks = s3tokenizer.make_non_pad_mask(lengths)
294
+ masks = [[1, 1, 1, 1, 1],
295
+ [1, 1, 1, 0, 0],
296
+ [1, 1, 0, 0, 0]]
297
+ """
298
+ batch_size = lengths.size(0)
299
+ max_len = max_len if max_len > 0 else lengths.max().item()
300
+ seq_range = torch.arange(0,
301
+ max_len,
302
+ dtype=torch.int64,
303
+ device=lengths.device)
304
+ seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
305
+ seq_length_expand = lengths.unsqueeze(-1)
306
+ mask = seq_range_expand >= seq_length_expand
307
+ return ~mask
308
+
309
+
310
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
311
+ """Convert bool-tensor to float-tensor for flash attention.
312
+
313
+ Parameters
314
+ ----------
315
+ mask (torch.Tensor): Boolean non-pad mask (B, ?).
316
+
317
+ Returns:
318
+ -------
319
+ torch.Tensor: Float attention bias, 0.0 at valid positions and -1.0e+10 at padded positions (B, ?).
320
+
321
+ Examples:
322
+ >>> import torch
323
+ >>> import s3tokenizer
324
+ >>> lengths = torch.tensor([5, 3, 2])
325
+ >>> masks = s3tokenizer.make_non_pad_mask(lengths)
326
+ masks = [[1, 1, 1, 1, 1],
327
+ [1, 1, 1, 0, 0],
328
+ [1, 1, 0, 0, 0]]
329
+ >>> new_masks = s3tokenizer.mask_to_bias(masks, torch.float32)
330
+ new_masks =
331
+ [[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00],
332
+ [-0.0000e+00, -0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10],
333
+ [-0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10]]
334
+ """
335
+ assert mask.dtype == torch.bool
336
+ assert dtype in [torch.float32, torch.bfloat16, torch.float16]
337
+ mask = mask.to(dtype)
338
+
339
+ # attention mask bias
340
+ # NOTE(Mddct): torch.finfo jit issues
341
+ # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
342
+ mask = (1.0 - mask) * -1.0e+10
343
+ return mask
344
+
345
+
346
+ def padding(data: List[torch.Tensor]):
347
+ """ Padding the data into batch data
348
+
349
+ Parameters
350
+ ----------
351
+ data: List[Tensor], shape of Tensor (128, T)
352
+
353
+ Returns:
354
+ -------
355
+ feats [B, 128, T_max], feats lengths [B]
356
+ """
357
+ sample = data
358
+ assert isinstance(sample, list)
359
+ feats_lengths = torch.tensor([s.size(1) for s in sample],
360
+ dtype=torch.int32)
361
+ feats = [s.t() for s in sample]
362
+ padded_feats = pad_sequence(feats, batch_first=True, padding_value=0)
363
+
364
+ return padded_feats.transpose(1, 2), feats_lengths
365
+
366
+
367
+ def merge_tokenized_segments(tokenized_segments, overlap, token_rate):
368
+ """
369
+ Merges tokenized outputs by keeping the middle and dropping half of the overlapped tokens.
370
+
371
+ Args:
372
+ - tokenized_segments (List[List[int]]): List of tokenized sequences.
373
+ - overlap (int): Overlapping duration in seconds (typically 4 s).
374
+ - token_rate (int): Number of tokens per second.
375
+
376
+ Returns:
377
+ - List[int]: A single merged token sequence.
378
+ """
379
+ merged_tokens = []
380
+ overlap_tokens = (
381
+ overlap //
382
+ 2) * token_rate # Tokens corresponding to half of the overlap duration
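+ # e.g. overlap=4 s at token_rate=25 gives overlap_tokens=50: adjacent segments share
+ # 100 tokens, and trimming 50 from each side of the junction removes the duplication once.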
383
+
384
+ for i, tokens in enumerate(tokenized_segments):
385
+ l = 0 if i == 0 else overlap_tokens
386
+ r = -overlap_tokens if i != len(tokenized_segments) - 1 else len(tokens)
387
+ # Keep only the middle part (drop overlap / 2 from both sides)
388
+ merged_tokens.extend(tokens[l:r])
389
+
390
+ return merged_tokens
speech/tools/S3Tokenizer/setup.py ADDED
@@ -0,0 +1,37 @@
1
+ from pathlib import Path
2
+
3
+ from setuptools import find_packages, setup
4
+
5
+
6
+ def parse_requirements(filename):
7
+ """Load requirements from a pip requirements file."""
8
+ with open(filename, 'r') as file:
9
+ lines = (line.strip() for line in file)
10
+ return [line for line in lines if line and not line.startswith('#')]
11
+
12
+
13
+ setup(
14
+ name="s3tokenizer",
15
+ version="0.2.0",
16
+ description=\
17
+ "Reverse Engineering of Supervised Semantic Speech Tokenizer (S3Tokenizer) proposed in CosyVoice", # noqa
18
+ long_description=open("README.md", encoding="utf-8").read(),
19
+ long_description_content_type="text/markdown",
20
+ python_requires=">=3.8",
21
+ author="xingchensong",
22
+ url="https://github.com/xingchensong/S3Tokenizer",
23
+ license="Apache2.0",
24
+ packages=find_packages(),
25
+ install_requires=parse_requirements(
26
+ Path(__file__).with_name("requirements.txt")),
27
+ entry_points={
28
+ "console_scripts": ["s3tokenizer=s3tokenizer.cli:main"],
29
+ },
30
+ include_package_data=True,
31
+ extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
32
+ classifiers=[
33
+ "Programming Language :: Python :: 3",
34
+ "Operating System :: OS Independent",
35
+ "Topic :: Scientific/Engineering",
36
+ ],
37
+ )
speech/tools/S3Tokenizer/test/test_batch_efficiency.py ADDED
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Batch processing efficiency test
4
+ Tests the efficiency improvement of the new batch-processing functionality on mixed long and short audio
5
+ """
6
+
7
+ import time
8
+ import torch
9
+ import pytest
10
+ import s3tokenizer
11
+
12
+
13
+ def create_test_audio(duration_seconds=20, sample_rate=16000):
14
+ """Create test audio"""
15
+ length = int(duration_seconds * sample_rate)
16
+ # Create meaningful audio signal (sine wave mixture)
17
+ t = torch.linspace(0, duration_seconds, length)
18
+ audio = 0.5 * torch.sin(2 * torch.pi * 440 * t) # 440Hz fundamental
19
+ audio += 0.3 * torch.sin(2 * torch.pi * 880 * t) # 880Hz second harmonic
20
+ audio += 0.1 * torch.randn(length) # Add some noise
21
+ return audio
22
+
23
+
24
+ @pytest.fixture
25
+ def test_audios():
26
+ """Create test audio dataset"""
27
+ return [
28
+ create_test_audio(10), # Short audio
29
+ create_test_audio(20), # Medium audio
30
+ create_test_audio(40), # Long audio
31
+ create_test_audio(60), # Long audio
32
+ create_test_audio(15), # Short audio
33
+ create_test_audio(35), # Long audio
34
+ create_test_audio(25), # Medium audio
35
+ create_test_audio(50), # Long audio
36
+ ]
37
+
38
+
39
+ @pytest.fixture
40
+ def long_audios():
41
+ """Create long audio dataset"""
42
+ return [
43
+ create_test_audio(45.5),
44
+ create_test_audio(60),
45
+ create_test_audio(91.2),
46
+ create_test_audio(120),
47
+ ]
48
+
49
+
50
+ @pytest.mark.parametrize("model_name", [
51
+ "speech_tokenizer_v1_25hz", "speech_tokenizer_v1",
52
+ "speech_tokenizer_v2_25hz"
53
+ ])
54
+ def test_batch_efficiency(test_audios, model_name):
55
+ """Test batch processing efficiency for different models"""
56
+ print(f"\n=== Batch Processing Efficiency Test for {model_name} ===")
57
+
58
+ # Load model
59
+ model = s3tokenizer.load_model(model_name)
60
+ model.eval()
61
+
62
+ # Method 1: Individual processing
63
+ print(f"\n--- Method 1: Individual Processing ({model_name}) ---")
64
+ start_time = time.time()
65
+ individual_results = []
66
+
67
+ for i, audio in enumerate(test_audios):
68
+ mel = s3tokenizer.log_mel_spectrogram(audio)
69
+ mels = mel.unsqueeze(0)
70
+ mels_lens = torch.tensor([mel.size(1)])
71
+
72
+ with torch.no_grad():
73
+ codes, codes_lens = model.quantize(mels, mels_lens)
74
+
75
+ final_codes = codes[0, :codes_lens[0].item()].tolist()
76
+ individual_results.append(final_codes)
77
+
78
+ duration = audio.shape[0] / 16000
79
+ processing_type = "Long audio" if duration > 30 else "Short audio"
80
+ print(
81
+ f"Audio {i+1}: {duration:.1f}s, {len(final_codes)} tokens, {processing_type}"
82
+ )
83
+
84
+ individual_time = time.time() - start_time
85
+ print(f"Individual processing total time: {individual_time:.2f}s")
86
+
87
+ # Method 2: Batch processing
88
+ print(f"\n--- Method 2: Batch Processing ({model_name}) ---")
89
+ start_time = time.time()
90
+
91
+ # Prepare batch input
92
+ mels = []
93
+ for audio in test_audios:
94
+ mel = s3tokenizer.log_mel_spectrogram(audio)
95
+ mels.append(mel)
96
+
97
+ # Use padding to handle different lengths of mel
98
+ mels, mels_lens = s3tokenizer.padding(mels)
99
+
100
+ # Batch processing
101
+ with torch.no_grad():
102
+ codes, codes_lens = model.quantize(mels, mels_lens)
103
+
104
+ # Process results
105
+ batch_results = []
106
+ for i in range(len(test_audios)):
107
+ final_codes = codes[i, :codes_lens[i].item()].tolist()
108
+ batch_results.append(final_codes)
109
+
110
+ duration = test_audios[i].shape[0] / 16000
111
+ processing_type = "Long audio" if duration > 30 else "Short audio"
112
+ print(
113
+ f"Audio {i+1}: {duration:.1f}s, {len(final_codes)} tokens, {processing_type}"
114
+ )
115
+
116
+ batch_time = time.time() - start_time
117
+ print(f"Batch processing total time: {batch_time:.2f}s")
118
+
119
+ # Verify result consistency
120
+ print(f"\n--- Result Verification for {model_name} ---")
121
+ all_ok = True
122
+ for i in range(len(test_audios)):
123
+ individual_tokens = individual_results[i]
124
+ batch_tokens = batch_results[i]
125
+
126
+ # Calculate miss rate
127
+ if len(individual_tokens) != len(batch_tokens):
128
+ print(
129
+ f"❌ Audio {i+1} length mismatch: individual={len(individual_tokens)}, batch={len(batch_tokens)}"
130
+ )
131
+ all_ok = False
132
+ else:
133
+ mismatches = sum(1 for a, b in zip(individual_tokens, batch_tokens)
134
+ if a != b)
135
+ miss_rate = mismatches / len(individual_tokens) * 100 if len(
136
+ individual_tokens) > 0 else 0
137
+
138
+ if miss_rate < 0.2: # Less than 0.2% is considered OK
139
+ print(f"✅ Audio {i+1} miss rate: {miss_rate:.4f}% (OK)")
140
+ else:
141
+ print(f"❌ Audio {i+1} miss rate: {miss_rate:.4f}% (Too high)")
142
+ all_ok = False
143
+
144
+ # Efficiency improvement
145
+ speedup = individual_time / batch_time
146
+ print(f"\n--- Efficiency Improvement for {model_name} ---")
147
+ print(f"Batch processing speedup: {speedup:.2f}x")
148
+ if speedup > 1:
149
+ print("✅ Batch processing indeed improves efficiency!")
150
+ else:
151
+ print("⚠️ Batch processing doesn't significantly improve efficiency")
152
+
153
+ # Assertions for pytest
154
+ assert all_ok, f"Results don't match for model {model_name}"
155
+ assert len(individual_results) == len(
156
+ batch_results), "Number of results don't match"
157
+ assert all(
158
+ len(individual_results[i]) == len(batch_results[i])
159
+ for i in range(len(test_audios))), "Token counts don't match"
160
+
161
+ # Performance assertion - batch should be at least as fast as individual (allowing for some variance)
162
+ # assert batch_time <= individual_time * 1.1, f"Batch processing should not be significantly slower than individual processing for {model_name}"
163
+
164
+
165
+ @pytest.mark.parametrize("model_name", [
166
+ "speech_tokenizer_v1_25hz", "speech_tokenizer_v1",
167
+ "speech_tokenizer_v2_25hz"
168
+ ])
169
+ def test_pure_long_audio_batch(long_audios, model_name):
170
+ """Test pure long audio batch processing for different models"""
171
+ print(f"\n=== Pure Long Audio Batch Processing Test for {model_name} ===")
172
+
173
+ model = s3tokenizer.load_model(model_name)
174
+ model.eval()
175
+
176
+ # Prepare batch input
177
+ mels = []
178
+ for audio in long_audios:
179
+ mel = s3tokenizer.log_mel_spectrogram(audio)
180
+ mels.append(mel)
181
+
182
+ mels, mels_lens = s3tokenizer.padding(mels)
183
+
184
+ # Batch process long audio
185
+ start_time = time.time()
186
+ with torch.no_grad():
187
+ codes, codes_lens = model.quantize(mels, mels_lens)
188
+ processing_time = time.time() - start_time
189
+
190
+ print(
191
+ f"Batch processing {len(long_audios)} long audios took: {processing_time:.2f}s"
192
+ )
193
+
194
+ results = []
195
+ for i in range(len(long_audios)):
196
+ duration = long_audios[i].shape[0] / 16000
197
+ tokens_count = codes_lens[i].item()
198
+ results.append((duration, tokens_count))
199
+ print(f"Long audio {i+1}: {duration:.1f}s → {tokens_count} tokens")
200
+
201
+ print(
202
+ f"✅ Pure long audio batch processing test completed for {model_name}")
203
+
204
+ # Assertions for pytest
205
+ assert codes is not None, f"Codes should not be None for model {model_name}"
206
+ assert codes_lens is not None, f"Codes lengths should not be None for model {model_name}"
207
+ assert len(results) == len(
208
+ long_audios), "Number of results should match number of input audios"
209
+ assert all(
210
+ tokens_count > 0
211
+ for _, tokens_count in results), "All audio should produce tokens"
212
+ assert processing_time > 0, "Processing time should be positive"
213
+
214
+
215
+ @pytest.mark.parametrize("model_name", [
216
+ "speech_tokenizer_v1_25hz", "speech_tokenizer_v1",
217
+ "speech_tokenizer_v2_25hz"
218
+ ])
219
+ def test_model_loading(model_name):
220
+ """Test that all models can be loaded successfully"""
221
+ print(f"\n=== Model Loading Test for {model_name} ===")
222
+
223
+ model = s3tokenizer.load_model(model_name)
224
+ assert model is not None, f"Model {model_name} should load successfully"
225
+
226
+ # Test model can be set to eval mode
227
+ model.eval()
228
+ print(f"✅ Model {model_name} loaded and set to eval mode successfully")
229
+
230
+
231
+ @pytest.mark.parametrize("model_name", [
232
+ "speech_tokenizer_v1_25hz", "speech_tokenizer_v1",
233
+ "speech_tokenizer_v2_25hz"
234
+ ])
235
+ def test_single_audio_processing(model_name):
236
+ """Test single audio processing for different models"""
237
+ print(f"\n=== Single Audio Processing Test for {model_name} ===")
238
+
239
+ # Create a single test audio
240
+ audio = create_test_audio(30) # 30 second audio
241
+
242
+ model = s3tokenizer.load_model(model_name)
243
+ model.eval()
244
+
245
+ # Process the audio
246
+ mel = s3tokenizer.log_mel_spectrogram(audio)
247
+ mels = mel.unsqueeze(0)
248
+ mels_lens = torch.tensor([mel.size(1)])
249
+
250
+ with torch.no_grad():
251
+ codes, codes_lens = model.quantize(mels, mels_lens)
252
+
253
+ final_codes = codes[0, :codes_lens[0].item()].tolist()
254
+
255
+ # Assertions
256
+ assert codes is not None, f"Codes should not be None for model {model_name}"
257
+ assert codes_lens is not None, f"Codes lengths should not be None for model {model_name}"
258
+ assert len(
259
+ final_codes) > 0, f"Should produce tokens for model {model_name}"
260
+ assert codes_lens[0].item() == len(
261
+ final_codes
262
+ ), f"Codes length should match actual codes for model {model_name}"
263
+
264
+ duration = audio.shape[0] / 16000
265
+ print(
266
+ f"✅ Single audio processing test completed for {model_name}: {duration:.1f}s → {len(final_codes)} tokens"
267
+ )
268
+
269
+
270
+ if __name__ == "__main__":
271
+ # Run tests with pytest
272
+ pytest.main([__file__, "-v"])
speech/tools/S3Tokenizer/test/test_onnx.py ADDED
@@ -0,0 +1,377 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright [2024-09-27] <sxc19@mails.tsinghua.edu.cn, Xingchen Song>
4
+
5
+ import os
6
+ import time
7
+ from typing import Dict, Any
8
+
9
+ import numpy as np
10
+ import onnxruntime
11
+ import pytest
12
+ import s3tokenizer
13
+ import torch
14
+
15
+
16
+ def create_test_audio(duration_seconds: float = 20,
17
+ sample_rate: int = 16000) -> torch.Tensor:
18
+ """Create synthetic test audio"""
19
+ length = int(duration_seconds * sample_rate)
20
+ # Create sinusoidal mixed audio
21
+ t = torch.linspace(0, duration_seconds, length)
22
+ audio = 0.5 * torch.sin(2 * torch.pi * 440 * t) # 440Hz fundamental
23
+ audio += 0.3 * torch.sin(2 * torch.pi * 880 * t) # 880Hz second harmonic
24
+ audio += 0.1 * torch.randn(length) # Add noise
25
+ return audio
26
+
27
+
28
+ @pytest.fixture
29
+ def test_audio_suite():
30
+ """Create a suite of test audios with different lengths"""
31
+ return {
32
+ "short_audio_1": create_test_audio(5.0), # 5 seconds
33
+ "short_audio_2": create_test_audio(15.0), # 15 seconds
34
+ "medium_audio": create_test_audio(25.0), # 25 seconds
35
+ "medium_audio_2": create_test_audio(30.0), # 30 seconds
36
+ "long_audio": create_test_audio(
37
+ 35.0), # 35 seconds - for torch and onnx, 2 segments with padding
38
+ "long_audio_2": create_test_audio(
39
+ 56.0
40
+ ), # 56 seconds - for torch and onnx, exactly 2 segments without padding
41
+ "very_long_audio": create_test_audio(
42
+ 60.0), # 60 seconds - for torch and onnx, 3 segments with padding
43
+ }
44
+
45
+
46
+ def onnx_inference_short_audio(model_name: str, mel: torch.Tensor,
47
+ mel_len: torch.Tensor) -> torch.Tensor:
48
+ """
49
+ ONNX inference for short audio (<=30s)
50
+ """
51
+ # Load ONNX model
52
+ default = os.path.join(os.path.expanduser("~"), ".cache")
53
+ download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default),
54
+ "s3tokenizer")
55
+
56
+ option = onnxruntime.SessionOptions()
57
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
58
+ option.intra_op_num_threads = 1
59
+ providers = ["CPUExecutionProvider"]
60
+
61
+ ort_session = onnxruntime.InferenceSession(
62
+ f"{download_root}/{model_name}.onnx",
63
+ sess_options=option,
64
+ providers=providers)
65
+
66
+ # Direct inference for short audio
67
+ onnx_output = ort_session.run(
68
+ None, {
69
+ ort_session.get_inputs()[0].name:
70
+ mel[:, :mel_len.item()].unsqueeze(0).detach().cpu().numpy(),
71
+ ort_session.get_inputs()[1].name:
72
+ np.array([mel_len.item()], dtype=np.int32)
73
+ })[0]
74
+
75
+ # Convert to numpy array to fix linter issues
76
+ onnx_output = np.array(onnx_output)
77
+
78
+ # Handle different output formats
79
+ if onnx_output.ndim == 2:
80
+ onnx_output = onnx_output[0, :]
81
+ elif onnx_output.ndim == 3:
82
+ onnx_output = onnx_output[0, 0, :]
83
+
84
+ return torch.tensor(onnx_output, dtype=torch.long)
85
+
86
+
87
+ def onnx_inference_long_audio(model_name: str, mel: torch.Tensor,
88
+ mel_len: torch.Tensor) -> torch.Tensor:
89
+ """
90
+ ONNX inference for long audio (>30s) using sliding window approach
91
+ Based on _quantize_mixed_batch logic
92
+
93
+ Note: This may fail due to ONNX model limitations with dynamic lengths
94
+ """
95
+ # Load ONNX model
96
+ default = os.path.join(os.path.expanduser("~"), ".cache")
97
+ download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default),
98
+ "s3tokenizer")
99
+
100
+ option = onnxruntime.SessionOptions()
101
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
102
+ option.intra_op_num_threads = 1
103
+ providers = ["CPUExecutionProvider"]
104
+
105
+ ort_session = onnxruntime.InferenceSession(
106
+ f"{download_root}/{model_name}.onnx",
107
+ sess_options=option,
108
+ providers=providers)
109
+
110
+ # Parameters for sliding window (same as _quantize_mixed_batch)
111
+ sample_rate = 16000
112
+ hop_length = 160
113
+ window_size = 30 # seconds
114
+ overlap = 4 # seconds
115
+
116
+ # Calculate frame-based parameters
117
+ frames_per_window = window_size * sample_rate // hop_length # 3000 frames
118
+ frames_per_overlap = overlap * sample_rate // hop_length # 400 frames
119
+ frames_per_stride = frames_per_window - frames_per_overlap # 2600 frames
120
+
121
+ # Split into segments
122
+ segments = []
123
+ segments_len = []
124
+ start = 0
125
+
126
+ while start < mel_len.item():
127
+ end = min(start + frames_per_window, mel_len.item())
128
+ segment = mel[:, start:end]
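+ # Unlike the torch path (which pads a short tail to 3000 frames), a trailing window
+ # shorter than 30 s is skipped here.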
129
+
130
+ if segment.size(1) < frames_per_window:
131
+ break
132
+
133
+ seg_len = segment.size(1)
134
+ segments.append(segment)
135
+ segments_len.append(seg_len)
136
+
137
+ start += frames_per_stride
138
+
139
+ if not segments:
140
+ raise ValueError("No valid segments for ONNX processing")
141
+
142
+ # Process each segment with ONNX
143
+ segment_results = []
144
+ for i, (segment, seg_len) in enumerate(zip(segments, segments_len)):
145
+ try:
146
+ onnx_output = ort_session.run(
147
+ None, {
148
+ ort_session.get_inputs()[0].name:
149
+ segment.unsqueeze(0).detach().cpu().numpy(),
150
+ ort_session.get_inputs()[1].name:
151
+ np.array([seg_len], dtype=np.int32)
152
+ })[0]
153
+
154
+ # Convert to numpy array to fix linter issues
155
+ onnx_output = np.array(onnx_output)
156
+
157
+ # Handle different output formats
158
+ if onnx_output.ndim == 2:
159
+ segment_codes = onnx_output[0, :].tolist()
160
+ elif onnx_output.ndim == 3:
161
+ segment_codes = onnx_output[0, 0, :].tolist()
162
+ else:
163
+ segment_codes = onnx_output.tolist()
164
+
165
+ segment_results.append(segment_codes)
166
+
167
+ except Exception as e:
168
+ print(f" ONNX error on segment {i+1}: {str(e)[:100]}...")
169
+ raise Exception(
170
+ f"ONNX inference failed on segment {i+1}: {str(e)}")
171
+
172
+ if not segment_results:
173
+ raise ValueError("All ONNX segments failed to process")
174
+
175
+ # Merge segments using the same logic as _quantize_mixed_batch
176
+ # Determine token rate based on model name
177
+ if model_name == "speech_tokenizer_v1":
178
+ token_rate = 50
179
+ else:
180
+ token_rate = 25
181
+
182
+ merged_codes = s3tokenizer.merge_tokenized_segments(
183
+ segment_results, overlap=overlap, token_rate=token_rate
184
+ )[:-overlap * token_rate] # NOTE(xcsong): drop the last overlap part.
185
+ return torch.tensor(merged_codes, dtype=torch.long)
186
+
187
+
188
+ def onnx_inference_with_long_audio_support(
189
+ model_name: str, mel: torch.Tensor,
190
+ mel_len: torch.Tensor) -> torch.Tensor:
191
+ """
192
+ ONNX inference with automatic long audio support
193
+ """
194
+ max_frames = 3000 # 30s * 16000 / 160 = 3000 frames
195
+
196
+ if mel_len.item() <= max_frames:
197
+ # Short audio - use direct inference
198
+ return onnx_inference_short_audio(model_name, mel, mel_len)
199
+ else:
200
+ # Long audio - use sliding window approach
201
+ return onnx_inference_long_audio(model_name, mel, mel_len)
202
+
+
+ def compare_torch_vs_onnx_single(model_name: str, audio: torch.Tensor,
+                                  audio_name: str) -> Dict[str, Any]:
+     """Run a single audio clip through both the torch and ONNX versions."""
+     duration = audio.shape[0] / 16000
+
+     # Load the torch model
+     tokenizer = s3tokenizer.load_model(model_name)
+     tokenizer.eval()
+
+     # Prepare the input mel spectrogram
+     mel = s3tokenizer.log_mel_spectrogram(audio)
+     mels = mel.unsqueeze(0)
+     mels_lens = torch.tensor([mel.size(1)])
+
+     # Time the torch version
+     start_time = time.time()
+     with torch.no_grad():
+         torch_codes, torch_codes_lens = tokenizer.quantize(mels, mels_lens)
+     torch_time = time.time() - start_time
+
+     torch_result = torch_codes[0, :torch_codes_lens[0].item()]
+
+     # Time the ONNX version with long-audio support
+     try:
+         start_time = time.time()
+         onnx_result = onnx_inference_with_long_audio_support(
+             model_name, mel, mels_lens[0])
+         onnx_time = time.time() - start_time
+
+         # Compare the two token sequences over their common prefix
+         min_len = min(len(torch_result), len(onnx_result))
+         torch_truncated = torch_result[:min_len]
+         onnx_truncated = onnx_result[:min_len]
+
+         are_equal = torch.equal(torch_truncated, onnx_truncated)
+         miss_rate = 0.0
+
+         if not are_equal:
+             miss_num = torch.sum(torch_truncated != onnx_truncated)
+             miss_rate = miss_num.item() * 100.0 / min_len
+
+         return {
+             "audio_name": audio_name,
+             "model_name": model_name,
+             "duration": duration,
+             "torch_tokens": torch_truncated,
+             "onnx_tokens": onnx_truncated,
+             "torch_time": torch_time,
+             "onnx_time": onnx_time,
+             "results_match": are_equal,
+             "miss_rate": miss_rate
+         }
+
+     except Exception as e:
+         return {
+             "audio_name": audio_name,
+             "model_name": model_name,
+             "duration": duration,
+             "torch_tokens": torch_result,
+             "onnx_tokens": [],
+             "torch_time": torch_time,
+             "onnx_time": 0.0,
+             "results_match": False,
+             "miss_rate": 100.0,
+             "error": str(e)
+         }
+
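+ # Reading the miss rate (a sketch with invented numbers): if min_len is
+ # 1000 tokens and 3 positions differ, miss_rate = 3 * 100.0 / 1000 = 0.3,
+ # which passes the < 0.5% threshold asserted in the tests below.
+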
+
+ @pytest.mark.parametrize("model_name", [
+     "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
+     "speech_tokenizer_v2_25hz"
+ ])
+ def test_torch_vs_onnx_short_audio(model_name, test_audio_suite):
+     """Test torch vs ONNX for short audio (<= 30 s)."""
+     print(f"\n=== Testing {model_name} on Short Audio ===")
+
+     short_audios = {
+         k: v
+         for k, v in test_audio_suite.items() if v.shape[0] / 16000 <= 30
+     }
+
+     results = []
+     for audio_name, audio in short_audios.items():
+         result = compare_torch_vs_onnx_single(model_name, audio, audio_name)
+         results.append(result)
+
+         duration = result["duration"]
+         torch_tokens = result["torch_tokens"]
+         onnx_tokens = result["onnx_tokens"]
+         match_status = "✅" if result["results_match"] else "❌"
+
+         print(f"{match_status} {audio_name}: {duration:.1f}s → "
+               f"torch:{len(torch_tokens)}, onnx:{len(onnx_tokens)}")
+
+         if not result["results_match"] and "error" not in result:
+             print(f"  Miss rate: {result['miss_rate']:.2f}%")
+             print(f"  torch_tokens:\n{torch_tokens}\n"
+                   f"onnx_tokens:\n{onnx_tokens}")
+
+     # Assertions
+     successful_tests = [r for r in results if "error" not in r]
+     assert len(successful_tests) == len(short_audios), (
+         f"successful tests ({len(successful_tests)}) for {model_name} "
+         f"should equal the number of short audios ({len(short_audios)})")
+
+     # For short audio we expect a near-exact match
+     for r in results:
+         assert r['miss_rate'] < 0.5, (
+             f"Miss rate too high for {model_name}: {r['miss_rate']:.2f}%")
+
+     print(f"\n{model_name} Short Audio Summary:")
+     print(f"  Successful tests: {len(successful_tests)}/{len(results)}")
+
+
+ @pytest.mark.parametrize("model_name", [
+     "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
+     "speech_tokenizer_v2_25hz"
+ ])
+ def test_torch_vs_onnx_long_audio(model_name, test_audio_suite):
+     """Test torch vs ONNX for long audio (> 30 s) using the ONNX
+     sliding-window implementation."""
+     print(f"\n=== Testing {model_name} on Long Audio "
+           f"(ONNX Sliding Window) ===")
+
+     long_audios = {
+         k: v
+         for k, v in test_audio_suite.items() if v.shape[0] / 16000 > 30
+     }
+
+     results = []
+     for audio_name, audio in long_audios.items():
+         result = compare_torch_vs_onnx_single(model_name, audio, audio_name)
+         results.append(result)
+
+         duration = result["duration"]
+         torch_tokens = result["torch_tokens"]
+         onnx_tokens = result["onnx_tokens"]
+         match_status = "✅" if result["results_match"] else "❌"
+
+         print(f"{match_status} {audio_name}: {duration:.1f}s → "
+               f"torch:{len(torch_tokens)}, onnx:{len(onnx_tokens)}")
+
+         if not result["results_match"] and "error" not in result:
+             print(f"  Miss rate: {result['miss_rate']:.2f}%")
+             print(f"  torch_tokens:\n{torch_tokens}\n"
+                   f"onnx_tokens:\n{onnx_tokens}")
+         elif "error" in result:
+             print(f"  Error: {result['error'][:100]}...")
+
+     # Every long-audio clip must process successfully through ONNX
+     successful_tests = [r for r in results if "error" not in r]
+     assert len(successful_tests) == len(long_audios), (
+         f"successful tests ({len(successful_tests)}) for {model_name} "
+         f"should equal the number of long audios ({len(long_audios)})")
+
+     print(f"\n{model_name} Long Audio Results:")
+     print(f"  Total tests: {len(results)}")
+     print(f"  Successful ONNX tests: {len(successful_tests)}")
+
+     for r in results:
+         # NOTE(xcsong): a miss rate below 0.5% is acceptable for long
+         # audio, since we drop the last overlap part.
+         assert r['miss_rate'] < 0.5, (
+             f"Miss rate too high for {model_name}: {r['miss_rate']}%")
+
+     # The main requirement is that the torch path always works
+     print("  ✅ Torch processing works reliably for all long audio")
+
374
+
375
+ if __name__ == "__main__":
376
+ # Run tests with pytest
377
+ pytest.main([__file__, "-v"])
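+
+ # A usage note: pytest's -k filter can select a subset of the
+ # parametrized cases from the CLI, e.g.
+ #     pytest -v -k "short_audio and v2_25hz" <path-to-this-file>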