niobures committed on
Commit bfc4d3c · verified · 1 Parent(s): f4280c0

Step-Audio (code, dataset, demo, paper, tools)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. Step-Audio-AQAA. A Fully End-to-End Expressive Large Audio Language Model.pdf +3 -0
  3. Step-Audio-EditX Technical Report.pdf +3 -0
  4. Step-Audio. Unified Understanding and Generation in Intelligent Speech Interaction.pdf +3 -0
  5. code/ComfyUI_StepAudioTTS.zip +3 -0
  6. code/Step-Audio [intervitens].zip +3 -0
  7. code/Step-Audio-EditX.zip +3 -0
  8. code/Step-Audio-tts.zip +3 -0
  9. code/Step-Audio.zip +3 -0
  10. code/Step-Audio2.zip +3 -0
  11. code/StepAudioInfer.zip +3 -0
  12. code/astrbot_plugin_tts_Step_Audio.zip +3 -0
  13. dataset/StepEval-Audio-360/.gitattributes +59 -0
  14. dataset/StepEval-Audio-360/README.md +79 -0
  15. dataset/StepEval-Audio-360/audios.tar.gz +3 -0
  16. dataset/StepEval-Audio-360/data/test-00000-of-00001.parquet +3 -0
  17. dataset/StepEval-Audio-360/source.txt +1 -0
  18. demo/Step-Audio-EditX/.gitattributes +4 -0
  19. demo/Step-Audio-EditX/.gitignore +2 -0
  20. demo/Step-Audio-EditX/LICENSE +201 -0
  21. demo/Step-Audio-EditX/README.md +13 -0
  22. demo/Step-Audio-EditX/__init__.py +0 -0
  23. demo/Step-Audio-EditX/app.py +505 -0
  24. demo/Step-Audio-EditX/config/__init__.py +12 -0
  25. demo/Step-Audio-EditX/config/edit_config.py +32 -0
  26. demo/Step-Audio-EditX/config/prompts.py +23 -0
  27. demo/Step-Audio-EditX/funasr_detach/__init__.py +38 -0
  28. demo/Step-Audio-EditX/funasr_detach/auto/__init__.py +0 -0
  29. demo/Step-Audio-EditX/funasr_detach/auto/auto_frontend.py +90 -0
  30. demo/Step-Audio-EditX/funasr_detach/auto/auto_model.py +575 -0
  31. demo/Step-Audio-EditX/funasr_detach/auto/auto_tokenizer.py +7 -0
  32. demo/Step-Audio-EditX/funasr_detach/bin/__init__.py +0 -0
  33. demo/Step-Audio-EditX/funasr_detach/bin/compute_audio_cmvn.py +152 -0
  34. demo/Step-Audio-EditX/funasr_detach/bin/inference.py +33 -0
  35. demo/Step-Audio-EditX/funasr_detach/bin/tokenize_text.py +281 -0
  36. demo/Step-Audio-EditX/funasr_detach/bin/train.py +227 -0
  37. demo/Step-Audio-EditX/funasr_detach/datasets/__init__.py +0 -0
  38. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/__init__.py +0 -0
  39. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/datasets.py +112 -0
  40. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/index_ds.py +150 -0
  41. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/preprocessor.py +55 -0
  42. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/samplers.py +306 -0
  43. demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/scp2jsonl.py +116 -0
  44. demo/Step-Audio-EditX/funasr_detach/download/__init__.py +0 -0
  45. demo/Step-Audio-EditX/funasr_detach/download/download_dataset_from_hub.py +19 -0
  46. demo/Step-Audio-EditX/funasr_detach/download/download_from_hub.py +231 -0
  47. demo/Step-Audio-EditX/funasr_detach/download/file.py +335 -0
  48. demo/Step-Audio-EditX/funasr_detach/download/name_maps_from_hub.py +13 -0
  49. demo/Step-Audio-EditX/funasr_detach/download/runtime_sdk_download_tool.py +60 -0
  50. demo/Step-Audio-EditX/funasr_detach/frontends/__init__.py +0 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Step-Audio-AQAA.[[:space:]]A[[:space:]]Fully[[:space:]]End-to-End[[:space:]]Expressive[[:space:]]Large[[:space:]]Audio[[:space:]]Language[[:space:]]Model.pdf filter=lfs diff=lfs merge=lfs -text
+ Step-Audio-EditX[[:space:]]Technical[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
+ Step-Audio.[[:space:]]Unified[[:space:]]Understanding[[:space:]]and[[:space:]]Generation[[:space:]]in[[:space:]]Intelligent[[:space:]]Speech[[:space:]]Interaction.pdf filter=lfs diff=lfs merge=lfs -text
Step-Audio-AQAA. A Fully End-to-End Expressive Large Audio Language Model.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4290ba946aaf9ebc8a1df00a905cbafb19f18ca3bcf9a38389716602ee5f7d7e
+ size 1203894
Step-Audio-EditX Technical Report.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c1ff7493dcedd3e506b8de85860b0c608d06f0392245fb5385b7fa8231234e50
+ size 786245
Step-Audio. Unified Understanding and Generation in Intelligent Speech Interaction.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce5f6d5b9575f4552c970f118d3191ff49f3e509a847f0fff58c23aa7b510b3f
+ size 6952309
code/ComfyUI_StepAudioTTS.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17e36cf8812529f50c8b80b72d9111e20c476a2694365ad4f9049f019106b38b
+ size 14201121
code/Step-Audio [intervitens].zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88549705b04c2bbde00c6e5c67b8966c0aac195c1605a756abd893c53d690e00
+ size 37467537
code/Step-Audio-EditX.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6376c6fe2c68201749f7dee3717eab495e5630d3611511e3afaa9b1fe265afcd
+ size 5979796
code/Step-Audio-tts.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bba78bbc9039e5b3d7df4f77917119bbacc6d8aa9baf4d8114edfceda83fc624
+ size 3827854
code/Step-Audio.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4aaa4b50011e9c82ac51020de7177b43a07e5f47cbd7e8bb55e80929cac5d7a
+ size 55625681
code/Step-Audio2.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:30f63a2dc8c598cc9c968c1a6cca2bc8150beeeec45b8b081843ac2580388dc9
+ size 26895459
code/StepAudioInfer.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bf64260061b9cfb68fc673770b78f561a783a889dc01a3346d5bb12c1f8bf25
+ size 39121775
code/astrbot_plugin_tts_Step_Audio.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6d563e44b30e27b1c51a05316e262ab6483db0cf6d42a43f5a5407ba9206380
+ size 6151313
dataset/StepEval-Audio-360/.gitattributes ADDED
@@ -0,0 +1,59 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mds filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
+ # Video files - compressed
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.webm filter=lfs diff=lfs merge=lfs -text
dataset/StepEval-Audio-360/README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ license: apache-2.0
+ ---
+ # StepEval-Audio-360
+ ## Dataset Description
+ StepEval Audio 360 is a comprehensive dataset for evaluating the abilities of multi-modal large language models (MLLMs) in human-AI audio interaction. This audio benchmark, sourced from professional human annotators, covers a full spectrum of capabilities: singing, creativity, role-playing, logical reasoning, voice understanding, voice instruction following, gaming, speech emotion control, and language ability.
+
+ ## Languages
+ StepEval Audio 360 comprises human voice recordings in several languages and dialects, including Chinese (Sichuan dialect and Cantonese), English, and Japanese. It contains both audio and transcription data.
+
+ ## Links
+ - Homepage: [Step-Audio](https://github.com/stepfun-ai/Step-Audio)
+ - Paper: [Step-Audio: Unified Understanding and Generation in Intelligent Speech Interaction](https://arxiv.org/abs/2502.11946)
+ - ModelScope: https://modelscope.cn/datasets/stepfun-ai/StepEval-Audio-360
+ - Step-Audio Model Suite:
+   - Step-Audio-Tokenizer:
+     - Hugging Face: https://huggingface.co/stepfun-ai/Step-Audio-Tokenizer
+     - ModelScope: https://modelscope.cn/models/stepfun-ai/Step-Audio-Tokenizer
+   - Step-Audio-Chat:
+     - Hugging Face: https://huggingface.co/stepfun-ai/Step-Audio-Chat
+     - ModelScope: https://modelscope.cn/models/stepfun-ai/Step-Audio-Chat
+   - Step-Audio-TTS-3B:
+     - Hugging Face: https://huggingface.co/stepfun-ai/Step-Audio-TTS-3B
+     - ModelScope: https://modelscope.cn/models/stepfun-ai/Step-Audio-TTS-3B
+
+ ## User Manual
+ * Download the dataset
+ ```
+ # Make sure you have git-lfs installed (https://git-lfs.com)
+ git lfs install
+ git clone https://huggingface.co/datasets/stepfun-ai/StepEval-Audio-360
+ cd StepEval-Audio-360
+ git lfs pull
+ ```
+
+ * Decompress the audio data
+ ```
+ mkdir audios
+ tar -xvf audios.tar.gz -C audios
+ ```
+
+ * How to use
+ ```
+ from datasets import load_dataset
+
+ dataset = load_dataset("stepfun-ai/StepEval-Audio-360")
+ dataset = dataset["test"]
+ for item in dataset:
+     conversation_id = item["conversation_id"]
+     category = item["category"]
+     conversation = item["conversation"]
+
+     # parse multi-turn dialogue data
+     for turn in conversation:
+         role = turn["role"]
+         text = turn["text"]
+         audio_filename = turn["audio_filename"]  # refers to a decompressed audio file
+         if audio_filename is not None:
+             print(role, text, audio_filename)
+         else:
+             print(role, text)
+ ```
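To work with the referenced audio itself, each turn's `audio_filename` can be joined with the `audios/` directory created by the decompression step above. A minimal sketch (the relative path layout and the use of `soundfile` are assumptions, not part of the dataset card):

```python
import os
import soundfile as sf  # assumed dependency for reading the wav files

AUDIO_ROOT = "audios"  # directory produced by: tar -xvf audios.tar.gz -C audios

def load_turn_audio(turn, audio_root=AUDIO_ROOT):
    """Return (waveform, sample_rate) for a turn, or None for text-only turns."""
    audio_filename = turn["audio_filename"]
    if audio_filename is None:
        return None
    # Assumed layout: extracted files sit directly under audios/
    return sf.read(os.path.join(audio_root, audio_filename))
```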
+
+ ## Licensing
+ This dataset project is licensed under the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0).
+
+ ## Citation
+ If you use this dataset, please cite it with the BibTeX entry below.
+ ```
+ @misc{stepfun_2025,
+     author    = { {StepFun} },
+     title     = { StepEval-Audio-360 (Revision 72a072e) },
+     year      = 2025,
+     url       = { https://huggingface.co/datasets/stepfun-ai/StepEval-Audio-360 },
+     doi       = { 10.57967/hf/4528 },
+     publisher = { Hugging Face }
+ }
+ ```
dataset/StepEval-Audio-360/audios.tar.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7e9f043765500c6f6940ae58a55cf226ddfdde533099f4765bc40d2710d82d3
+ size 166398432
dataset/StepEval-Audio-360/data/test-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2990e77433b866431bbad8adc27b3aebee77046ceca5d265113994fedf2eaff
+ size 69065
dataset/StepEval-Audio-360/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/datasets/stepfun-ai/StepEval-Audio-360
demo/Step-Audio-EditX/.gitattributes ADDED
@@ -0,0 +1,4 @@
+ examples filter=lfs diff=lfs merge=lfs -text
+ speakers/nezha_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ speakers/nezhaRAP_prompt.wav filter=lfs diff=lfs merge=lfs -text
+ speakers/nezha哼唱_prompt.wav filter=lfs diff=lfs merge=lfs -text
demo/Step-Audio-EditX/.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ output/
demo/Step-Audio-EditX/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
demo/Step-Audio-EditX/README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Step-Audio-EditX
+ emoji: 🚀
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: true
+ short_description: Try out Step-Audio-EditX
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
demo/Step-Audio-EditX/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/app.py ADDED
@@ -0,0 +1,505 @@
1
+ import gradio as gr
2
+ import os
3
+ import argparse
4
+ import torch
5
+ import logging
6
+ import threading
7
+ from datetime import datetime
8
+ import torchaudio
9
+ import librosa
10
+ import soundfile as sf
11
+
12
+ # ZeroGPU support
13
+ try:
14
+ import spaces
15
+ ZEROGPU_AVAILABLE = True
16
+ except ImportError:
17
+ ZEROGPU_AVAILABLE = False
18
+ # Create a dummy decorator for non-ZeroGPU environments
19
+ class spaces:
20
+ @staticmethod
21
+ def GPU(duration=10):
22
+ def decorator(func):
23
+ return func
24
+ return decorator
25
+
26
+ # Project imports
27
+ from tokenizer import StepAudioTokenizer
28
+ from tts import StepAudioTTS
29
+ from model_loader import ModelSource
30
+ from config.edit_config import get_supported_edit_types
31
+
32
+ # Configure logging
33
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Global variables for ZeroGPU-optimized loading
37
+ encoder = None
38
+ common_tts_engine = None
39
+ args_global = None
40
+ _model_lock = threading.Lock() # Thread lock for model initialization
41
+
42
+ def initialize_models():
43
+ """Initialize models on first GPU call (ZeroGPU optimization: load inside GPU context)"""
44
+ global encoder, common_tts_engine, args_global
45
+
46
+ # Fast path: check if already initialized (without lock)
47
+ if common_tts_engine is not None:
48
+ return # Already initialized
49
+
50
+ # Slow path: acquire lock and double-check
51
+ with _model_lock:
52
+ # Double-check pattern: another thread might have initialized while waiting for lock
53
+ if common_tts_engine is not None:
54
+ return # Already initialized by another thread
55
+
56
+ if args_global is None:
57
+ raise RuntimeError("Global args not set. Cannot initialize models.")
58
+
59
+ try:
60
+ logger.info("🚀 Initializing models inside GPU context (first call)...")
61
+
62
+ # Determine model source
63
+ source_mapping = {
64
+ "auto": ModelSource.AUTO,
65
+ "local": ModelSource.LOCAL,
66
+ "modelscope": ModelSource.MODELSCOPE,
67
+ "huggingface": ModelSource.HUGGINGFACE
68
+ }
69
+ model_source = source_mapping[args_global.model_source]
70
+
71
+ # Load StepAudioTokenizer (avoid CUDA initialization in main process)
72
+ encoder = StepAudioTokenizer(
73
+ os.path.join(args_global.model_path, "Step-Audio-Tokenizer"),
74
+ model_source=model_source,
75
+ funasr_model_id=args_global.tokenizer_model_id
76
+ )
77
+ logger.info("✓ StepAudioTokenizer loaded")
78
+
79
+ # Initialize common TTS engine (avoid CUDA initialization in main process)
80
+ common_tts_engine = StepAudioTTS(
81
+ os.path.join(args_global.model_path, "Step-Audio-EditX"),
82
+ encoder,
83
+ model_source=model_source,
84
+ tts_model_id=args_global.tts_model_id
85
+ )
86
+ logger.info("✓ StepCommonAudioTTS loaded")
87
+ print("Models initialized inside GPU context.")
88
+
89
+ if ZEROGPU_AVAILABLE:
90
+ logger.info("💡 Models loaded inside GPU context - ready for inference")
91
+ else:
92
+ logger.info("💡 Models loaded - ready for inference")
93
+
94
+ except Exception as e:
95
+ logger.error(f"❌ Error loading models: {e}")
96
+ raise
97
+
98
+ def get_model_config():
99
+ """Get model configuration without initializing GPU models"""
100
+ if args_global is None:
101
+ raise RuntimeError("Global args not set. Cannot get model config.")
102
+
103
+ return {
104
+ "encoder_path": os.path.join(args_global.model_path, "Step-Audio-Tokenizer"),
105
+ "tts_path": os.path.join(args_global.model_path, "Step-Audio-EditX"),
106
+ "model_source": args_global.model_source,
107
+ "tokenizer_model_id": args_global.tokenizer_model_id,
108
+ "tts_model_id": args_global.tts_model_id
109
+ }
110
+
111
+ def get_gpu_duration(audio_input, text_input, target_text, task_type, task_info):
112
+ """Dynamic GPU duration based on whether models need initialization"""
113
+ global common_tts_engine
114
+
115
+ if common_tts_engine is None:
116
+ # First call - need time for model loading (up to 5 minutes)
117
+ return 300 # Maximum allowed duration for model initialization
118
+ else:
119
+ # Subsequent calls - only inference time needed
120
+ return 120 # Standard inference duration
121
+
122
+ @spaces.GPU(duration=get_gpu_duration) # Dynamic duration based on model state
123
+ def process_audio_with_gpu(audio_input, text_input, target_text, task_type, task_info):
124
+ """Process audio using GPU (models are loaded inside GPU context to avoid main process errors)"""
125
+ global common_tts_engine
126
+
127
+ # Initialize models if not already loaded (inside GPU context to avoid main process errors)
128
+ if common_tts_engine is None:
129
+ print("Initializing common_tts_engine inside GPU context...")
130
+ logger.info("🎯 GPU allocated for 300s (first call with model loading)...")
131
+ initialize_models()
132
+ logger.info("✅ Models loaded successfully inside GPU context")
133
+ else:
134
+ print("common_tts_engine already initialized.")
135
+ logger.info("🎯 GPU allocated for 120s (inference with loaded models)...")
136
+
137
+ try:
138
+ # Use loaded models (first call may include loading time, subsequent calls are fast)
139
+ if task_type == "clone":
140
+ output_audio, sr = common_tts_engine.clone(audio_input, text_input, target_text)
141
+ else:
142
+ output_audio, sr = common_tts_engine.edit(audio_input, text_input, task_type, task_info, target_text)
143
+
144
+ logger.info("✅ Audio processing completed")
145
+ return output_audio, sr
146
+
147
+ except Exception as e:
148
+ logger.error(f"❌ Audio processing failed: {e}")
149
+ raise
150
+ # GPU automatically deallocated when function exits
151
+
152
+ # Save audio to temporary directory
153
+ def save_audio(audio_type, audio_data, sr, tmp_dir):
154
+ """Save audio data to a temporary file with timestamp"""
155
+ current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
156
+ save_path = os.path.join(tmp_dir, audio_type, f"{current_time}.wav")
157
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
158
+
159
+ try:
160
+ if isinstance(audio_data, torch.Tensor):
161
+ torchaudio.save(save_path, audio_data, sr)
162
+ else:
163
+ sf.write(save_path, audio_data, sr)
164
+ logger.debug(f"Audio saved to: {save_path}")
165
+ return save_path
166
+ except Exception as e:
167
+ logger.error(f"Failed to save audio: {e}")
168
+ raise
169
+
170
+
171
+ class EditxTab:
172
+ """Audio editing and voice cloning interface tab"""
173
+
174
+ def __init__(self, args):
175
+ self.args = args
176
+ self.edit_type_list = list(get_supported_edit_types().keys())
177
+ self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
178
+
179
+ def history_messages_to_show(self, messages):
180
+ """Convert message history to gradio chatbot format"""
181
+ show_msgs = []
182
+ for message in messages:
183
+ edit_type = message['edit_type']
184
+ edit_info = message['edit_info']
185
+ source_text = message['source_text']
186
+ target_text = message['target_text']
187
+ raw_audio_part = message['raw_wave']
188
+ edit_audio_part = message['edit_wave']
189
+ type_str = f"{edit_type}-{edit_info}" if edit_info is not None else f"{edit_type}"
190
+ show_msgs.extend([
191
+ {"role": "user", "content": f"任务类型:{type_str}\n文本:{source_text}"},
192
+ {"role": "user", "content": gr.Audio(value=raw_audio_part, interactive=False)},
193
+ {"role": "assistant", "content": f"输出音频:\n文本:{target_text}"},
194
+ {"role": "assistant", "content": gr.Audio(value=edit_audio_part, interactive=False)}
195
+ ])
196
+ return show_msgs
197
+
198
+ def generate_clone(self, prompt_text_input, prompt_audio_input, generated_text, edit_type, edit_info, state):
199
+ """Generate cloned audio (models are loaded on first GPU call)"""
200
+ self.logger.info("Starting voice cloning process")
201
+ state['history_audio'] = []
202
+ state['history_messages'] = []
203
+
204
+ # Input validation
205
+ if not prompt_text_input or prompt_text_input.strip() == "":
206
+ error_msg = "[Error] Uploaded text cannot be empty."
207
+ self.logger.error(error_msg)
208
+ return [{"role": "user", "content": error_msg}], state
209
+ if not prompt_audio_input:
210
+ error_msg = "[Error] Uploaded audio cannot be empty."
211
+ self.logger.error(error_msg)
212
+ return [{"role": "user", "content": error_msg}], state
213
+ if not generated_text or generated_text.strip() == "":
214
+ error_msg = "[Error] Clone content cannot be empty."
215
+ self.logger.error(error_msg)
216
+ return [{"role": "user", "content": error_msg}], state
217
+ if edit_type != "clone":
218
+ error_msg = "[Error] CLONE button must use clone task."
219
+ self.logger.error(error_msg)
220
+ return [{"role": "user", "content": error_msg}], state
221
+
222
+ try:
223
+ # Use GPU inference with models loaded inside GPU context
224
+ output_audio, output_sr = process_audio_with_gpu(
225
+ prompt_audio_input, prompt_text_input, generated_text, "clone", edit_info
226
+ )
227
+
228
+ if output_audio is not None and output_sr is not None:
229
+ # Convert tensor to numpy if needed
230
+ if isinstance(output_audio, torch.Tensor):
231
+ audio_numpy = output_audio.cpu().numpy().squeeze()
232
+ else:
233
+ audio_numpy = output_audio
234
+
235
+ # Load original audio for comparison
236
+ input_audio_data_numpy, input_sample_rate = librosa.load(prompt_audio_input)
237
+
238
+ # Create message for history
239
+ cur_assistant_msg = {
240
+ "edit_type": edit_type,
241
+ "edit_info": edit_info,
242
+ "source_text": prompt_text_input,
243
+ "target_text": generated_text,
244
+ "raw_wave": (input_sample_rate, input_audio_data_numpy),
245
+ "edit_wave": (output_sr, audio_numpy),
246
+ }
247
+ state["history_audio"].append((output_sr, audio_numpy, generated_text))
248
+ state["history_messages"].append(cur_assistant_msg)
249
+
250
+ show_msgs = self.history_messages_to_show(state["history_messages"])
251
+ self.logger.info("Voice cloning completed successfully")
252
+ return show_msgs, state
253
+ else:
254
+ error_msg = "[Error] Clone failed"
255
+ self.logger.error(error_msg)
256
+ return [{"role": "user", "content": error_msg}], state
257
+
258
+ except Exception as e:
259
+ error_msg = f"[Error] Clone failed: {str(e)}"
260
+ self.logger.error(error_msg)
261
+ return [{"role": "user", "content": error_msg}], state
262
+
263
+ def generate_edit(self, prompt_text_input, prompt_audio_input, generated_text, edit_type, edit_info, state):
264
+ """Generate edited audio (models are loaded on first GPU call)"""
265
+ self.logger.info("Starting audio editing process")
266
+
267
+ # Input validation
268
+ if not prompt_audio_input:
269
+ error_msg = "[Error] Uploaded audio cannot be empty."
270
+ self.logger.error(error_msg)
271
+ return [{"role": "user", "content": error_msg}], state
272
+
273
+ try:
274
+ # Determine which audio to use
275
+ if len(state["history_audio"]) == 0:
276
+ # First edit - use uploaded audio
277
+ audio_to_edit = prompt_audio_input
278
+ text_to_use = prompt_text_input
279
+ self.logger.debug("Using prompt audio, no history found")
280
+ else:
281
+ # Use previous edited audio - save it to temp file first
282
+ sample_rate, audio_numpy, previous_text = state["history_audio"][-1]
283
+ temp_path = save_audio("temp", audio_numpy, sample_rate, self.args.tmp_dir)
284
+ audio_to_edit = temp_path
285
+ text_to_use = previous_text
286
+ self.logger.debug(f"Using previous audio from history, count: {len(state['history_audio'])}")
287
+
288
+ # For para-linguistic, use generated_text; otherwise use source text
289
+ if edit_type not in {"paralinguistic"}:
290
+ generated_text = text_to_use
291
+
292
+ # Use GPU inference with models loaded inside GPU context
293
+ output_audio, output_sr = process_audio_with_gpu(
294
+ audio_to_edit, text_to_use, generated_text, edit_type, edit_info
295
+ )
296
+
297
+ if output_audio is not None and output_sr is not None:
298
+ # Convert tensor to numpy if needed
299
+ if isinstance(output_audio, torch.Tensor):
300
+ audio_numpy = output_audio.cpu().numpy().squeeze()
301
+ else:
302
+ audio_numpy = output_audio
303
+
304
+ # Load original audio for comparison
305
+ if len(state["history_audio"]) == 0:
306
+ input_audio_data_numpy, input_sample_rate = librosa.load(prompt_audio_input)
307
+ else:
308
+ input_sample_rate, input_audio_data_numpy, _ = state["history_audio"][-1]
309
+
310
+ # Create message for history
311
+ cur_assistant_msg = {
312
+ "edit_type": edit_type,
313
+ "edit_info": edit_info,
314
+ "source_text": text_to_use,
315
+ "target_text": generated_text,
316
+ "raw_wave": (input_sample_rate, input_audio_data_numpy),
317
+ "edit_wave": (output_sr, audio_numpy),
318
+ }
319
+ state["history_audio"].append((output_sr, audio_numpy, generated_text))
320
+ state["history_messages"].append(cur_assistant_msg)
321
+
322
+ show_msgs = self.history_messages_to_show(state["history_messages"])
323
+ self.logger.info("Audio editing completed successfully")
324
+ return show_msgs, state
325
+ else:
326
+ error_msg = "[Error] Edit failed"
327
+ self.logger.error(error_msg)
328
+ return [{"role": "user", "content": error_msg}], state
329
+
330
+ except Exception as e:
331
+ error_msg = f"[Error] Edit failed: {str(e)}"
332
+ self.logger.error(error_msg)
333
+ return [{"role": "user", "content": error_msg}], state
334
+
335
+ def clear_history(self, state):
336
+ """Clear conversation history"""
337
+ state["history_messages"] = []
338
+ state["history_audio"] = []
339
+ return [], state
340
+
341
+ def init_state(self):
342
+ """Initialize conversation state"""
343
+ return {
344
+ "history_messages": [],
345
+ "history_audio": []
346
+ }
347
+
348
+ def register_components(self):
349
+ """Register gradio components - maintaining exact layout from original"""
350
+ with gr.Tab("Editx"):
351
+ with gr.Row():
352
+ with gr.Column():
353
+ self.model_input = gr.Textbox(label="Model Name", value="Step-Audio-EditX", scale=1)
354
+ self.prompt_text_input = gr.Textbox(label="Prompt Text", value="", scale=1)
355
+ self.prompt_audio_input = gr.Audio(
356
+ sources=["upload", "microphone"],
357
+ format="wav",
358
+ type="filepath",
359
+ label="Input Audio",
360
+ )
361
+ self.generated_text = gr.Textbox(label="Target Text", lines=1, max_lines=200, max_length=1000)
362
+ with gr.Column():
363
+ with gr.Row():
364
+ self.edit_type = gr.Dropdown(label="Task", choices=self.edit_type_list, value="clone")
365
+ self.edit_info = gr.Dropdown(label="Sub-task", choices=[], value=None)
366
+ self.chat_box = gr.Chatbot(label="History", type="messages", height=480*1)
367
+ with gr.Row():
368
+ with gr.Column():
369
+ with gr.Row():
370
+ self.button_tts = gr.Button("CLONE", variant="primary")
371
+ self.button_edit = gr.Button("EDIT", variant="primary")
372
+ with gr.Column():
373
+ self.clean_history_submit = gr.Button("Clear History", variant="primary")
374
+
375
+ gr.Markdown("---")
376
+ gr.Markdown("""
377
+ **Button Description:**
378
+ - CLONE: Synthesizes audio from the uploaded audio and text; used only in clone mode, and clears the history when used.
379
+ - EDIT: Edits the uploaded audio, or keeps stacking edit effects on top of the previously generated audio.
380
+ """)
381
+ gr.Markdown("""
382
+ **Operation Workflow:**
383
+ - Upload the audio to be edited on the left side and fill in the corresponding text content of the audio;
384
+ - If the task requires modifying text content (such as clone, para-linguistic), fill in the text to be synthesized in the "clone text" field. For all other tasks, keep the uploaded audio text content unchanged;
385
+ - Select tasks and subtasks on the right side (some tasks have no subtasks, such as vad, etc.);
386
+ - Click the "CLONE" or "EDIT" button on the left side, and audio will be generated in the dialog box on the right side.
387
+ """)
388
+ gr.Markdown("""
389
+ **Para-linguistic Description:**
390
+ - Supported tags include: [Breathing] [Laughter] [Surprise-oh] [Confirmation-en] [Uhm] [Surprise-ah] [Surprise-wa] [Sigh] [Question-ei] [Dissatisfaction-hnn]
391
+ - Example:
392
+ - Fill in "clone text" field: "Great, the weather is so nice today." Click the "CLONE" button to get audio.
393
+ - Change "clone text" field to: "Great[Laughter], the weather is so nice today[Surprise-ah]." Click the "EDIT" button to get para-linguistic audio.
394
+ """)
395
+
396
+ def register_events(self):
397
+ """Register event handlers"""
398
+ # Create independent state for each session
399
+ state = gr.State(self.init_state())
400
+
401
+ self.button_tts.click(self.generate_clone,
402
+ inputs=[self.prompt_text_input, self.prompt_audio_input, self.generated_text, self.edit_type, self.edit_info, state],
403
+ outputs=[self.chat_box, state])
404
+ self.button_edit.click(self.generate_edit,
405
+ inputs=[self.prompt_text_input, self.prompt_audio_input, self.generated_text, self.edit_type, self.edit_info, state],
406
+ outputs=[self.chat_box, state])
407
+
408
+ self.clean_history_submit.click(self.clear_history, inputs=[state], outputs=[self.chat_box, state])
409
+ self.edit_type.change(
410
+ fn=self.update_edit_info,
411
+ inputs=self.edit_type,
412
+ outputs=self.edit_info,
413
+ )
414
+
415
+ def update_edit_info(self, category):
416
+ """Update sub-task dropdown based on main task selection"""
417
+ category_items = get_supported_edit_types()
418
+ choices = category_items.get(category, [])
419
+ value = None if len(choices) == 0 else choices[0]
420
+ return gr.Dropdown(label="Sub-task", choices=choices, value=value)
421
+
422
+
423
+ def launch_demo(args, editx_tab):
424
+ """Launch the gradio demo"""
425
+ with gr.Blocks(
426
+ theme=gr.themes.Soft(),
427
+ title="🎙️ Step-Audio-EditX",
428
+ css="""
429
+ :root {
430
+ --font: "Helvetica Neue", Helvetica, Arial, sans-serif;
431
+ --font-mono: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
432
+ }
433
+ """) as demo:
434
+ gr.Markdown("## 🎙️ Step-Audio-EditX")
435
+ gr.Markdown("Audio Editing and Zero-Shot Cloning using Step-Audio-EditX")
436
+
437
+ # Register components
438
+ editx_tab.register_components()
439
+
440
+ # Register events
441
+ editx_tab.register_events()
442
+
443
+ # Launch demo
444
+ demo.queue().launch(
445
+ server_name=args.server_name,
446
+ server_port=args.server_port,
447
+ share=args.share if hasattr(args, 'share') else False
448
+ )
449
+
450
+
451
+ if __name__ == "__main__":
452
+ # Parse command line arguments
453
+ parser = argparse.ArgumentParser(description="Step-Audio Edit Demo")
454
+ parser.add_argument("--model-path", type=str, default="stepfun-ai", help="Model path.")
455
+ parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Demo server name.")
456
+ parser.add_argument("--server-port", type=int, default=7860, help="Demo server port.")
457
+ parser.add_argument("--tmp-dir", type=str, default="/tmp/gradio", help="Save path.")
458
+ parser.add_argument("--share", action="store_true", help="Share gradio app.")
459
+
460
+ # Multi-source loading support parameters
461
+ parser.add_argument(
462
+ "--model-source",
463
+ type=str,
464
+ default="huggingface",
465
+ choices=["auto", "local", "modelscope", "huggingface"],
466
+ help="Model source: auto (detect automatically), local, modelscope, or huggingface"
467
+ )
468
+ parser.add_argument(
469
+ "--tokenizer-model-id",
470
+ type=str,
471
+ default="dengcunqin/speech_paraformer-large_asr_nat-zh-cantonese-en-16k-vocab8501-online",
472
+ help="Tokenizer model ID for online loading"
473
+ )
474
+ parser.add_argument(
475
+ "--tts-model-id",
476
+ type=str,
477
+ default=None,
478
+ help="TTS model ID for online loading (if different from model-path)"
479
+ )
480
+
481
+ args = parser.parse_args()
482
+
483
+ # Store args globally for model configuration
484
+ args_global = args
485
+
486
+ logger.info(f"Configuration loaded:")
487
+ logger.info(f"Model source: {args.model_source}")
488
+ logger.info(f"Model path: {args.model_path}")
489
+ logger.info(f"Tokenizer model ID: {args.tokenizer_model_id}")
490
+ if args.tts_model_id:
491
+ logger.info(f"TTS model ID: {args.tts_model_id}")
492
+
493
+ # Models will be initialized on first GPU call to avoid ZeroGPU main process errors
494
+
495
+ if ZEROGPU_AVAILABLE:
496
+ logger.info("🎉 ZeroGPU detected - using dynamic GPU duration management!")
497
+ logger.info("💡 First call: 300s (model loading), subsequent calls: 120s (inference only)")
498
+ else:
499
+ logger.info("💻 Running in local mode - models will be loaded on first call")
500
+
501
+ # Create EditxTab instance
502
+ editx_tab = EditxTab(args)
503
+
504
+ # Launch demo
505
+ launch_demo(args, editx_tab)
demo/Step-Audio-EditX/config/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """
+ Configuration module for Step-Audio
+ """
+
+ from .prompts import AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL, AUDIO_EDIT_SYSTEM_PROMPT
+ from .edit_config import get_supported_edit_types
+
+ __all__ = [
+     'AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL',
+     'AUDIO_EDIT_SYSTEM_PROMPT',
+     'get_supported_edit_types'
+ ]
demo/Step-Audio-EditX/config/edit_config.py ADDED
@@ -0,0 +1,32 @@
+ """
+ Audio editing configuration module.
+ Contains the supported edit types and related configuration.
+ """
+
+ def get_supported_edit_types():
+     """
+     Get the supported edit types and their options.
+
+     Returns:
+         Dict[str, list]: Dictionary of edit types and their options
+     """
+     return {
+         "clone": [],
+         "emotion": [
+             'happy', 'angry', 'sad', 'humour', 'confusion', 'disgusted',
+             'empathy', 'embarrass', 'fear', 'surprised', 'excited',
+             'depressed', 'coldness', 'admiration', 'remove'
+         ],
+         "style": [
+             'serious', 'arrogant', 'child', 'older', 'girl', 'pure',
+             'sister', 'sweet', 'ethereal', 'whisper', 'gentle', 'recite',
+             'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
+             'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
+             'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly',
+             'remove', 'exaggerated'
+         ],
+         "vad": [],
+         "denoise": [],
+         "paralinguistic": [],
+         "speed": ["faster", "slower", "more faster", "more slower"],
+     }
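For orientation, the keys of this mapping feed the "Task" dropdown in the demo and the value lists feed the "Sub-task" dropdown (see `update_edit_info` in app.py). A small usage sketch, assuming the demo's `config` package is on the import path:

```python
from config.edit_config import get_supported_edit_types

edit_types = get_supported_edit_types()
print(list(edit_types.keys()))  # task choices: clone, emotion, style, vad, denoise, paralinguistic, speed
print(edit_types["speed"])      # sub-task choices for "speed": faster, slower, more faster, more slower
```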
demo/Step-Audio-EditX/config/prompts.py ADDED
@@ -0,0 +1,23 @@
+ """
+ System prompt configuration module.
+ Contains all TTS- and editing-related system prompts.
+ """
+
+ AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL = """Generate audio with the following timbre, prosody and speaking style
+
+ [speaker_start]
+ speaker name: {speaker}
+ speaker prompt text:
+ {prompt_text}
+ speaker audio tokens:
+ {prompt_wav_tokens}
+ [speaker_end]
+ """
+
+ AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel in interpreting user instructions and applying precise adjustments to meet their needs. Your expertise spans a wide range of enhancement capabilities, including but not limited to:
+ # Emotional Enhancement
+ # Speaking Style Transfer
+ # Non-linguistic Adjustments
+ # Audio Tuning & Editing
+ Note: You will receive instructions in natural language and are expected to accurately interpret and execute the most suitable audio edits and enhancements.
+ """
demo/Step-Audio-EditX/funasr_detach/__init__.py ADDED
@@ -0,0 +1,38 @@
+ """Initialize funasr package."""
+
+ import os
+ import pkgutil
+ import importlib
+
+ dirname = os.path.dirname(__file__)
+ version_file = os.path.join(dirname, "version.txt")
+ with open(version_file, "r") as f:
+     __version__ = f.read().strip()
+
+
+ import importlib
+ import pkgutil
+
+
+ def import_submodules(package, recursive=True):
+     if isinstance(package, str):
+         package = importlib.import_module(package)
+     results = {}
+     for loader, name, is_pkg in pkgutil.walk_packages(
+         package.__path__, package.__name__ + "."
+     ):
+         try:
+             results[name] = importlib.import_module(name)
+         except Exception as e:
+             # To see details of import failures, uncomment the line below
+             # print(f"Failed to import {name}: {e}")
+             pass
+         if recursive and is_pkg:
+             results.update(import_submodules(name))
+     return results
+
+
+ import_submodules(__name__)
+
+ from funasr_detach.auto.auto_model import AutoModel
+ from funasr_detach.auto.auto_frontend import AutoFrontend
demo/Step-Audio-EditX/funasr_detach/auto/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/funasr_detach/auto/auto_frontend.py ADDED
@@ -0,0 +1,90 @@
1
+ import time
2
+ import logging
3
+ from tqdm import tqdm
4
+
5
+ from funasr_detach.register import tables
6
+ from funasr_detach.download.download_from_hub import download_model
7
+ from funasr_detach.utils.load_utils import load_audio_text_image_video, extract_fbank
8
+ from funasr_detach.auto.auto_model import prepare_data_iterator
9
+ from funasr_detach.auto.auto_model import prepare_data_iterator
10
+
11
+
12
+ class AutoFrontend:
13
+ def __init__(self, **kwargs):
14
+ assert "model" in kwargs
15
+ if "model_conf" not in kwargs:
16
+ logging.info(
17
+ "download models from model hub: {}".format(
18
+ kwargs.get("model_hub", "ms")
19
+ )
20
+ )
21
+ kwargs = download_model(**kwargs)
22
+
23
+ # build frontend
24
+ frontend = kwargs.get("frontend", None)
25
+ if frontend is not None:
26
+ frontend_class = tables.frontend_classes.get(frontend)
27
+ frontend = frontend_class(**kwargs["frontend_conf"])
28
+
29
+ self.frontend = frontend
30
+ if "frontend" in kwargs:
31
+ del kwargs["frontend"]
32
+ self.kwargs = kwargs
33
+
34
+ def __call__(self, input, input_len=None, kwargs=None, **cfg):
35
+
36
+ kwargs = self.kwargs if kwargs is None else kwargs
37
+ kwargs.update(cfg)
38
+
39
+ key_list, data_list = prepare_data_iterator(input, input_len=input_len)
40
+ batch_size = kwargs.get("batch_size", 1)
41
+ device = kwargs.get("device", "cpu")
42
+ if device == "cpu":
43
+ batch_size = 1
44
+
45
+ meta_data = {}
46
+
47
+ result_list = []
48
+ num_samples = len(data_list)
49
+ pbar = tqdm(colour="blue", total=num_samples + 1, dynamic_ncols=True)
50
+
51
+ time0 = time.perf_counter()
52
+ for beg_idx in range(0, num_samples, batch_size):
53
+ end_idx = min(num_samples, beg_idx + batch_size)
54
+ data_batch = data_list[beg_idx:end_idx]
55
+ key_batch = key_list[beg_idx:end_idx]
56
+
57
+ # extract fbank feats
58
+ time1 = time.perf_counter()
59
+ audio_sample_list = load_audio_text_image_video(
60
+ data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000)
61
+ )
62
+ time2 = time.perf_counter()
63
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
64
+ speech, speech_lengths = extract_fbank(
65
+ audio_sample_list,
66
+ data_type=kwargs.get("data_type", "sound"),
67
+ frontend=self.frontend,
68
+ **kwargs,
69
+ )
70
+ time3 = time.perf_counter()
71
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
72
+ meta_data["batch_data_time"] = (
73
+ speech_lengths.sum().item()
74
+ * self.frontend.frame_shift
75
+ * self.frontend.lfr_n
76
+ / 1000
77
+ )
78
+
79
+ speech, speech_lengths = speech.to(device=device), speech_lengths.to(device=device)  # .to() is not in-place; reassign so the batch tensors actually move to `device`
80
+ batch = {"input": speech, "input_len": speech_lengths, "key": key_batch}
81
+ result_list.append(batch)
82
+
83
+ pbar.update(1)
84
+ description = f"{meta_data}, "
85
+ pbar.set_description(description)
86
+
87
+ time_end = time.perf_counter()
88
+ pbar.set_description(f"time elapsed total: {time_end - time0:0.3f}")
89
+
90
+ return result_list
demo/Step-Audio-EditX/funasr_detach/auto/auto_model.py ADDED
@@ -0,0 +1,575 @@
1
+ import json
2
+ import time
3
+ import copy
4
+ import torch
5
+ import random
6
+ import string
7
+ import logging
8
+ import os.path
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+
12
+ from funasr_detach.register import tables
13
+ from funasr_detach.utils.load_utils import load_bytes
14
+ from funasr_detach.download.file import download_from_url
15
+ from funasr_detach.download.download_from_hub import download_model
16
+ from funasr_detach.utils.vad_utils import slice_padding_audio_samples
17
+ from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
18
+ from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model
19
+ from funasr_detach.utils.load_utils import load_audio_text_image_video
20
+ from funasr_detach.utils.timestamp_tools import timestamp_sentence
21
+ from funasr_detach.models.campplus.utils import sv_chunk, postprocess, distribute_spk
22
+
23
+ try:
24
+ from funasr_detach.models.campplus.cluster_backend import ClusterBackend
25
+ except ImportError:
26
+ print("If you want to use the speaker diarization, please `pip install hdbscan`")
27
+
28
+
29
+ def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
30
+ """
31
+
32
+ :param input:
33
+ :param input_len:
34
+ :param data_type:
35
+ :param frontend:
36
+ :return:
37
+ """
38
+ data_list = []
39
+ key_list = []
40
+ filelist = [".scp", ".txt", ".json", ".jsonl"]
41
+
42
+ chars = string.ascii_letters + string.digits
43
+ if isinstance(data_in, str) and data_in.startswith("http"): # url
44
+ data_in = download_from_url(data_in)
45
+ if isinstance(data_in, str) and os.path.exists(
46
+ data_in
47
+ ): # wav_path; filelist: wav.scp, file.jsonl;text.txt;
48
+ _, file_extension = os.path.splitext(data_in)
49
+ file_extension = file_extension.lower()
50
+ if file_extension in filelist: # filelist: wav.scp, file.jsonl;text.txt;
51
+ with open(data_in, encoding="utf-8") as fin:
52
+ for line in fin:
53
+ key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
54
+ if data_in.endswith(
55
+ ".jsonl"
56
+ ): # file.jsonl: json.dumps({"source": data})
57
+ lines = json.loads(line.strip())
58
+ data = lines["source"]
59
+ key = data["key"] if "key" in data else key
60
+ else: # filelist, wav.scp, text.txt: id \t data or data
61
+ lines = line.strip().split(maxsplit=1)
62
+ data = lines[1] if len(lines) > 1 else lines[0]
63
+ key = lines[0] if len(lines) > 1 else key
64
+
65
+ data_list.append(data)
66
+ key_list.append(key)
67
+ else:
68
+ key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
69
+ data_list = [data_in]
70
+ key_list = [key]
71
+ elif isinstance(data_in, (list, tuple)):
72
+ if data_type is not None and isinstance(
73
+ data_type, (list, tuple)
74
+ ): # multiple inputs
75
+ data_list_tmp = []
76
+ for data_in_i, data_type_i in zip(data_in, data_type):
77
+ key_list, data_list_i = prepare_data_iterator(
78
+ data_in=data_in_i, data_type=data_type_i
79
+ )
80
+ data_list_tmp.append(data_list_i)
81
+ data_list = []
82
+ for item in zip(*data_list_tmp):
83
+ data_list.append(item)
84
+ else:
85
+ # [audio sample point, fbank, text]
86
+ data_list = data_in
87
+ key_list = [
88
+ "rand_key_" + "".join(random.choice(chars) for _ in range(13))
89
+ for _ in range(len(data_in))
90
+ ]
91
+ else: # raw text; audio sample point, fbank; bytes
92
+ if isinstance(data_in, bytes): # audio bytes
93
+ data_in = load_bytes(data_in)
94
+ if key is None:
95
+ key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
96
+ data_list = [data_in]
97
+ key_list = [key]
98
+
99
+ return key_list, data_list
100
+
101
+
102
+ class AutoModel:
103
+
104
+ def __init__(self, **kwargs):
105
+ if not kwargs.get("disable_log", False):
106
+ tables.print()
107
+
108
+ model, kwargs = self.build_model(**kwargs)
109
+
110
+ # if vad_model is not None, build vad model else None
111
+ vad_model = kwargs.get("vad_model", None)
112
+ vad_kwargs = kwargs.get("vad_model_revision", None)
113
+ if vad_model is not None:
114
+ logging.info("Building VAD model.")
115
+ vad_kwargs = {
116
+ "model": vad_model,
117
+ "model_revision": vad_kwargs,
118
+ "device": kwargs["device"],
119
+ }
120
+ vad_model, vad_kwargs = self.build_model(**vad_kwargs)
121
+
122
+ # if punc_model is not None, build punc model else None
123
+ punc_model = kwargs.get("punc_model", None)
124
+ punc_kwargs = kwargs.get("punc_model_revision", None)
125
+ if punc_model is not None:
126
+ logging.info("Building punc model.")
127
+ punc_kwargs = {
128
+ "model": punc_model,
129
+ "model_revision": punc_kwargs,
130
+ "device": kwargs["device"],
131
+ }
132
+ punc_model, punc_kwargs = self.build_model(**punc_kwargs)
133
+
134
+ # if spk_model is not None, build spk model else None
135
+ spk_model = kwargs.get("spk_model", None)
136
+ spk_kwargs = kwargs.get("spk_model_revision", None)
137
+ if spk_model is not None:
138
+ logging.info("Building SPK model.")
139
+ spk_kwargs = {
140
+ "model": spk_model,
141
+ "model_revision": spk_kwargs,
142
+ "device": kwargs["device"],
143
+ }
144
+ spk_model, spk_kwargs = self.build_model(**spk_kwargs)
145
+ self.cb_model = ClusterBackend().to(kwargs["device"])
146
+ spk_mode = kwargs.get("spk_mode", "punc_segment")
147
+ if spk_mode not in ["default", "vad_segment", "punc_segment"]:
148
+ logging.error(
149
+ "spk_mode should be one of default, vad_segment and punc_segment."
150
+ )
151
+ self.spk_mode = spk_mode
152
+
153
+ self.kwargs = kwargs
154
+ self.model = model
155
+ self.vad_model = vad_model
156
+ self.vad_kwargs = vad_kwargs
157
+ self.punc_model = punc_model
158
+ self.punc_kwargs = punc_kwargs
159
+ self.spk_model = spk_model
160
+ self.spk_kwargs = spk_kwargs
161
+ self.model_path = kwargs.get("model_path")
162
+ self.repo_path = kwargs.get("repo_path")
163
+
164
+
165
+ def build_model(self, **kwargs):
166
+ assert "model" in kwargs
167
+ if "model_conf" not in kwargs:
168
+ logging.info(
169
+ "download models from model hub: {}".format(
170
+ kwargs.get("model_hub", "ms")
171
+ )
172
+ )
173
+ kwargs = download_model(**kwargs)
174
+
175
+ set_all_random_seed(kwargs.get("seed", 0))
176
+
177
+ device = kwargs.get("device", "cuda")
178
+ if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
179
+ device = "cpu"
180
+ kwargs["batch_size"] = 1
181
+ kwargs["device"] = device
182
+
183
+ if kwargs.get("ncpu", None):
184
+ torch.set_num_threads(kwargs.get("ncpu"))
185
+
186
+ # build tokenizer
187
+ tokenizer = kwargs.get("tokenizer", None)
188
+ if tokenizer is not None:
189
+ tokenizer_class = tables.tokenizer_classes.get(tokenizer)
190
+ tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
191
+ kwargs["tokenizer"] = tokenizer
192
+ kwargs["token_list"] = tokenizer.token_list
193
+ vocab_size = len(tokenizer.token_list)
194
+ else:
195
+ vocab_size = -1
196
+
197
+ # build frontend
198
+ frontend = kwargs.get("frontend", None)
199
+ if frontend is not None:
200
+ frontend_class = tables.frontend_classes.get(frontend)
201
+ frontend = frontend_class(**kwargs["frontend_conf"])
202
+ kwargs["frontend"] = frontend
203
+ kwargs["input_size"] = frontend.output_size()
204
+
205
+ # build model
206
+ model_class = tables.model_classes.get(kwargs["model"])
207
+ model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=vocab_size)
208
+
209
+ model.to(device)
210
+
211
+ # init_param
212
+ init_param = kwargs.get("init_param", None)
213
+ if init_param is not None:
214
+ logging.info(f"Loading pretrained params from {init_param}")
215
+ load_pretrained_model(
216
+ model=model,
217
+ path=init_param,
218
+ ignore_init_mismatch=kwargs.get("ignore_init_mismatch", False),
219
+ oss_bucket=kwargs.get("oss_bucket", None),
220
+ scope_map=kwargs.get("scope_map", None),
221
+ excludes=kwargs.get("excludes", None),
222
+ )
223
+
224
+ return model, kwargs
225
+
226
+ def __call__(self, *args, **cfg):
227
+ kwargs = self.kwargs
228
+ kwargs.update(cfg)
229
+ res = self.model(*args, **kwargs)
230
+ return res
231
+
232
+ def generate(self, input, input_len=None, **cfg):
233
+ if self.vad_model is None:
234
+ return self.inference(input, input_len=input_len, **cfg)
235
+
236
+ else:
237
+ return self.inference_with_vad(input, input_len=input_len, **cfg)
238
+
239
+ def inference(
240
+ self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
241
+ ):
242
+ kwargs = self.kwargs if kwargs is None else kwargs
243
+ kwargs.update(cfg)
244
+ model = self.model if model is None else model
245
+ model = model.cuda()
246
+ model.eval()
247
+
248
+ batch_size = kwargs.get("batch_size", 1)
249
+ # if kwargs.get("device", "cpu") == "cpu":
250
+ # batch_size = 1
251
+
252
+ key_list, data_list = prepare_data_iterator(
253
+ input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
254
+ )
255
+
256
+ speed_stats = {}
257
+ asr_result_list = []
258
+ num_samples = len(data_list)
259
+ disable_pbar = kwargs.get("disable_pbar", False)
260
+ pbar = (
261
+ tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
262
+ if not disable_pbar
263
+ else None
264
+ )
265
+ time_speech_total = 0.0
266
+ time_escape_total = 0.0
267
+ for beg_idx in range(0, num_samples, batch_size):
268
+ end_idx = min(num_samples, beg_idx + batch_size)
269
+ data_batch = data_list[beg_idx:end_idx]
270
+ key_batch = key_list[beg_idx:end_idx]
271
+ batch = {"data_in": data_batch, "key": key_batch}
272
+ if (end_idx - beg_idx) == 1 and kwargs.get(
273
+ "data_type", None
274
+ ) == "fbank": # fbank
275
+ batch["data_in"] = data_batch[0]
276
+ batch["data_lengths"] = input_len
277
+
278
+ time1 = time.perf_counter()
279
+ with torch.no_grad():
280
+ results, meta_data = model.inference(**batch, **kwargs)
281
+ time2 = time.perf_counter()
282
+
283
+ asr_result_list.extend(results)
284
+
285
+ # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item()
286
+ batch_data_time = meta_data.get("batch_data_time", -1)
287
+ time_escape = time2 - time1
288
+ speed_stats["load_data"] = meta_data.get("load_data", 0.0)
289
+ speed_stats["extract_feat"] = meta_data.get("extract_feat", 0.0)
290
+ speed_stats["forward"] = f"{time_escape:0.3f}"
291
+ speed_stats["batch_size"] = f"{len(results)}"
292
+ speed_stats["time_cost"] = f"{(time_escape)}"
293
+ speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}"
294
+ description = f"{speed_stats}, "
295
+ if pbar:
296
+ pbar.update(1)
297
+ pbar.set_description(description)
298
+ time_speech_total += batch_data_time
299
+ time_escape_total += time_escape
300
+
301
+ if pbar:
302
+ # pbar.update(1)
303
+ pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}")
304
+ torch.cuda.empty_cache()
305
+ return asr_result_list
306
+
307
+ def inference_with_vad(self, input, input_len=None, **cfg):
308
+
309
+ # step.1: compute the vad model
310
+ self.vad_kwargs.update(cfg)
311
+ beg_vad = time.time()
312
+ res = self.inference(
313
+ input,
314
+ input_len=input_len,
315
+ model=self.vad_model,
316
+ kwargs=self.vad_kwargs,
317
+ **cfg,
318
+ )
319
+ end_vad = time.time()
320
+ print(f"time cost vad: {end_vad - beg_vad:0.3f}")
321
+
322
+ # step.2 compute asr model
323
+ model = self.model
324
+ kwargs = self.kwargs
325
+ kwargs.update(cfg)
326
+ batch_size = int(kwargs.get("batch_size_s", 300)) * 1000
327
+ batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60)) * 1000
328
+ kwargs["batch_size"] = batch_size
329
+
330
+ key_list, data_list = prepare_data_iterator(
331
+ input, input_len=input_len, data_type=kwargs.get("data_type", None)
332
+ )
333
+ results_ret_list = []
334
+ time_speech_total_all_samples = 1e-6
335
+
336
+ beg_total = time.time()
337
+ pbar_total = tqdm(colour="red", total=len(res), dynamic_ncols=True)
338
+ for i in range(len(res)):
339
+ key = res[i]["key"]
340
+ vadsegments = res[i]["value"]
341
+ input_i = data_list[i]
342
+ speech = load_audio_text_image_video(
343
+ input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000)
344
+ )
345
+ speech_lengths = len(speech)
346
+ n = len(vadsegments)
347
+ data_with_index = [(vadsegments[i], i) for i in range(n)]
348
+ sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
349
+ results_sorted = []
350
+
351
+ if not len(sorted_data):
352
+ logging.info("decoding, utt: {}, empty speech".format(key))
353
+ continue
354
+
355
+ if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
356
+ batch_size = max(
357
+ batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]
358
+ )
359
+
360
+ batch_size_ms_cum = 0
361
+ beg_idx = 0
362
+ beg_asr_total = time.time()
363
+ time_speech_total_per_sample = speech_lengths / 16000
364
+ time_speech_total_all_samples += time_speech_total_per_sample
365
+
366
+ all_segments = []
367
+ for j in range(n):
368
+ # pbar_sample.update(1)
369
+ batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0]
370
+ if (
371
+ j < n - 1
372
+ and (
373
+ batch_size_ms_cum
374
+ + sorted_data[j + 1][0][1]
375
+ - sorted_data[j + 1][0][0]
376
+ )
377
+ < batch_size
378
+ and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0])
379
+ < batch_size_threshold_ms
380
+ ):
381
+ continue
382
+ batch_size_ms_cum = 0
383
+ end_idx = j + 1
384
+ speech_j, speech_lengths_j = slice_padding_audio_samples(
385
+ speech, speech_lengths, sorted_data[beg_idx:end_idx]
386
+ )
387
+ results = self.inference(
388
+ speech_j,
389
+ input_len=None,
390
+ model=model,
391
+ kwargs=kwargs,
392
+ disable_pbar=True,
393
+ **cfg,
394
+ )
395
+ if self.spk_model is not None:
396
+ # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]]
397
+ for _b in range(len(speech_j)):
398
+ vad_segments = [
399
+ [
400
+ sorted_data[beg_idx:end_idx][_b][0][0] / 1000.0,
401
+ sorted_data[beg_idx:end_idx][_b][0][1] / 1000.0,
402
+ np.array(speech_j[_b]),
403
+ ]
404
+ ]
405
+ segments = sv_chunk(vad_segments)
406
+ all_segments.extend(segments)
407
+ speech_b = [i[2] for i in segments]
408
+ spk_res = self.inference(
409
+ speech_b,
410
+ input_len=None,
411
+ model=self.spk_model,
412
+ kwargs=kwargs,
413
+ disable_pbar=True,
414
+ **cfg,
415
+ )
416
+ results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"]
417
+ beg_idx = end_idx
418
+ if len(results) < 1:
419
+ continue
420
+ results_sorted.extend(results)
421
+
422
+ restored_data = [0] * n
423
+ for j in range(n):
424
+ index = sorted_data[j][1]
425
+ restored_data[index] = results_sorted[j]
426
+ result = {}
427
+
428
+ # results combine for texts, timestamps, speaker embeddings and others
429
+ # TODO: rewrite for clean code
430
+ for j in range(n):
431
+ for k, v in restored_data[j].items():
432
+ if k.startswith("timestamp"):
433
+ if k not in result:
434
+ result[k] = []
435
+ for t in restored_data[j][k]:
436
+ t[0] += vadsegments[j][0]
437
+ t[1] += vadsegments[j][0]
438
+ result[k].extend(restored_data[j][k])
439
+ elif k == "spk_embedding":
440
+ if k not in result:
441
+ result[k] = restored_data[j][k]
442
+ else:
443
+ result[k] = torch.cat(
444
+ [result[k], restored_data[j][k]], dim=0
445
+ )
446
+ elif "text" in k:
447
+ if k not in result:
448
+ result[k] = restored_data[j][k]
449
+ else:
450
+ result[k] += " " + restored_data[j][k]
451
+ else:
452
+ if k not in result:
453
+ result[k] = restored_data[j][k]
454
+ else:
455
+ result[k] += restored_data[j][k]
456
+
457
+ return_raw_text = kwargs.get("return_raw_text", False)
458
+ # step.3 compute punc model
459
+ if self.punc_model is not None:
460
+ self.punc_kwargs.update(cfg)
461
+ punc_res = self.inference(
462
+ result["text"],
463
+ model=self.punc_model,
464
+ kwargs=self.punc_kwargs,
465
+ disable_pbar=True,
466
+ **cfg,
467
+ )
468
+ raw_text = copy.copy(result["text"])
469
+ if return_raw_text:
470
+ result["raw_text"] = raw_text
471
+ result["text"] = punc_res[0]["text"]
472
+ else:
473
+ raw_text = None
474
+
475
+ # speaker embedding cluster after resorted
476
+ if self.spk_model is not None and kwargs.get("return_spk_res", True):
477
+ if raw_text is None:
478
+ logging.error("Missing punc_model, which is required by spk_model.")
479
+ all_segments = sorted(all_segments, key=lambda x: x[0])
480
+ spk_embedding = result["spk_embedding"]
481
+ labels = self.cb_model(
482
+ spk_embedding.cpu(), oracle_num=kwargs.get("preset_spk_num", None)
483
+ )
484
+ # del result['spk_embedding']
485
+ sv_output = postprocess(all_segments, None, labels, spk_embedding.cpu())
486
+ if self.spk_mode == "vad_segment": # recover sentence_list
487
+ sentence_list = []
488
+ for res, vadsegment in zip(restored_data, vadsegments):
489
+ if "timestamp" not in res:
490
+ logging.error(
491
+ "Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
492
+ and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
493
+ can predict timestamp, and speaker diarization relies on timestamps."
494
+ )
495
+ sentence_list.append(
496
+ {
497
+ "start": vadsegment[0],
498
+ "end": vadsegment[1],
499
+ "sentence": res["text"],
500
+ "timestamp": res["timestamp"],
501
+ }
502
+ )
503
+ elif self.spk_mode == "punc_segment":
504
+ if "timestamp" not in result:
505
+ logging.error(
506
+ "Only 'iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch' \
507
+ and 'iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'\
508
+ can predict timestamp, and speaker diarization relies on timestamps."
509
+ )
510
+ sentence_list = timestamp_sentence(
511
+ punc_res[0]["punc_array"],
512
+ result["timestamp"],
513
+ raw_text,
514
+ return_raw_text=return_raw_text,
515
+ )
516
+ distribute_spk(sentence_list, sv_output)
517
+ result["sentence_info"] = sentence_list
518
+ elif kwargs.get("sentence_timestamp", False):
519
+ sentence_list = timestamp_sentence(
520
+ punc_res[0]["punc_array"],
521
+ result["timestamp"],
522
+ raw_text,
523
+ return_raw_text=return_raw_text,
524
+ )
525
+ result["sentence_info"] = sentence_list
526
+ if "spk_embedding" in result:
527
+ del result["spk_embedding"]
528
+
529
+ result["key"] = key
530
+ results_ret_list.append(result)
531
+ end_asr_total = time.time()
532
+ time_escape_total_per_sample = end_asr_total - beg_asr_total
533
+ pbar_total.update(1)
534
+ pbar_total.set_description(
535
+ f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, "
536
+ f"time_speech: {time_speech_total_per_sample: 0.3f}, "
537
+ f"time_escape: {time_escape_total_per_sample:0.3f}"
538
+ )
539
+
540
+ return results_ret_list
541
+
542
+ def infer_encoder(
543
+ self, input, input_len=None, model=None, kwargs=None, key=None, **cfg
544
+ ):
545
+ kwargs = self.kwargs if kwargs is None else kwargs
546
+ kwargs.update(cfg)
547
+ model = self.model if model is None else model
548
+ model = model.cuda()
549
+ model.eval()
550
+
551
+ batch_size = kwargs.get("batch_size", 1)
552
+
553
+ key_list, data_list = prepare_data_iterator(
554
+ input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key
555
+ )
556
+
557
+ asr_result_list = []
558
+ num_samples = len(data_list)
559
+ for beg_idx in range(0, num_samples, batch_size):
560
+ end_idx = min(num_samples, beg_idx + batch_size)
561
+ data_batch = data_list[beg_idx:end_idx]
562
+ key_batch = key_list[beg_idx:end_idx]
563
+ batch = {"data_in": data_batch, "key": key_batch}
564
+ if (end_idx - beg_idx) == 1 and kwargs.get(
565
+ "data_type", None
566
+ ) == "fbank": # fbank
567
+ batch["data_in"] = data_batch[0]
568
+ batch["data_lengths"] = input_len
569
+
570
+ with torch.no_grad():
571
+ results, meta_data, cache = model.infer_encoder(**batch, **kwargs)
572
+ asr_result_list.extend(results)
573
+
574
+ torch.cuda.empty_cache()
575
+ return asr_result_list, cache
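A rough usage sketch for the AutoModel class added above; the model identifiers and the wav path are illustrative placeholders (assumptions), not values defined in this commit:

```python
# Hypothetical usage of the AutoModel pipeline defined in auto_model.py.
# Model names and "example.wav" are placeholders resolved via download_model().
from funasr_detach.auto.auto_model import AutoModel

model = AutoModel(
    model="paraformer-zh",   # placeholder ASR model id
    vad_model="fsmn-vad",    # optional: routes generate() through inference_with_vad()
    punc_model="ct-punc",    # optional: punctuation restoration on the merged text
    device="cuda",
)
res = model.generate(input="example.wav", batch_size_s=300)
print(res[0]["text"])
```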
demo/Step-Audio-EditX/funasr_detach/auto/auto_tokenizer.py ADDED
@@ -0,0 +1,7 @@
1
+ class AutoTokenizer:
2
+ """
3
+ Placeholder; not implemented in this detached copy.
4
+ """
5
+
6
+ def __init__(self):
7
+ pass
demo/Step-Audio-EditX/funasr_detach/bin/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/funasr_detach/bin/compute_audio_cmvn.py ADDED
@@ -0,0 +1,152 @@
1
+ import os
2
+ import json
3
+ import numpy as np
4
+ import torch
5
+ import hydra
6
+ import logging
7
+ from omegaconf import DictConfig, OmegaConf
8
+
9
+ from funasr_detach.register import tables
10
+ from funasr_detach.download.download_from_hub import download_model
11
+ from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
12
+
13
+
14
+ @hydra.main(config_name=None, version_base=None)
15
+ def main_hydra(kwargs: DictConfig):
16
+ if kwargs.get("debug", False):
17
+ import pdb
18
+
19
+ pdb.set_trace()
20
+
21
+ assert "model" in kwargs
22
+ if "model_conf" not in kwargs:
23
+ logging.info(
24
+ "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
25
+ )
26
+ kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)
27
+
28
+ main(**kwargs)
29
+
30
+
31
+ def main(**kwargs):
32
+ print(kwargs)
33
+ # set random seed
34
+ tables.print()
35
+ set_all_random_seed(kwargs.get("seed", 0))
36
+ torch.backends.cudnn.enabled = kwargs.get(
37
+ "cudnn_enabled", torch.backends.cudnn.enabled
38
+ )
39
+ torch.backends.cudnn.benchmark = kwargs.get(
40
+ "cudnn_benchmark", torch.backends.cudnn.benchmark
41
+ )
42
+ torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)
43
+
44
+ tokenizer = kwargs.get("tokenizer", None)
45
+
46
+ # build frontend if frontend is not None
47
+ frontend = kwargs.get("frontend", None)
48
+ if frontend is not None:
49
+ frontend_class = tables.frontend_classes.get(frontend)
50
+ frontend = frontend_class(**kwargs["frontend_conf"])
51
+ kwargs["frontend"] = frontend
52
+ kwargs["input_size"] = frontend.output_size()
53
+
54
+ # dataset
55
+ dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
56
+ dataset_train = dataset_class(
57
+ kwargs.get("train_data_set_list"),
58
+ frontend=frontend,
59
+ tokenizer=None,
60
+ is_training=False,
61
+ **kwargs.get("dataset_conf")
62
+ )
63
+
64
+ # dataloader
65
+ batch_sampler = kwargs["dataset_conf"].get(
66
+ "batch_sampler", "DynamicBatchLocalShuffleSampler"
67
+ )
68
+ batch_sampler_train = None
69
+ if batch_sampler is not None:
70
+ batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
71
+ dataset_conf = kwargs.get("dataset_conf")
72
+ dataset_conf["batch_type"] = "example"
73
+ dataset_conf["batch_size"] = 1
74
+ batch_sampler_train = batch_sampler_class(
75
+ dataset_train, is_training=False, **dataset_conf
76
+ )
77
+
78
+ dataloader_train = torch.utils.data.DataLoader(
79
+ dataset_train,
80
+ collate_fn=dataset_train.collator,
81
+ batch_sampler=batch_sampler_train,
82
+ num_workers=int(kwargs.get("dataset_conf").get("num_workers", 4)),
83
+ pin_memory=True,
84
+ )
85
+
86
+ iter_stop = int(kwargs.get("scale", 1.0) * len(dataloader_train))
87
+
88
+ total_frames = 0
89
+ for batch_idx, batch in enumerate(dataloader_train):
90
+ if batch_idx >= iter_stop:
91
+ break
92
+
93
+ fbank = batch["speech"].numpy()[0, :, :]
94
+ if total_frames == 0:
95
+ mean_stats = np.sum(fbank, axis=0)
96
+ var_stats = np.sum(np.square(fbank), axis=0)
97
+ else:
98
+ mean_stats += np.sum(fbank, axis=0)
99
+ var_stats += np.sum(np.square(fbank), axis=0)
100
+ total_frames += fbank.shape[0]
101
+
102
+ cmvn_info = {
103
+ "mean_stats": list(mean_stats.tolist()),
104
+ "var_stats": list(var_stats.tolist()),
105
+ "total_frames": total_frames,
106
+ }
107
+ cmvn_file = kwargs.get("cmvn_file", "cmvn.json")
108
+ # import pdb;pdb.set_trace()
109
+ with open(cmvn_file, "w") as fout:
110
+ fout.write(json.dumps(cmvn_info))
111
+
112
+ mean = -1.0 * mean_stats / total_frames
113
+ var = 1.0 / np.sqrt(var_stats / total_frames - mean * mean)
114
+ dims = mean.shape[0]
115
+ am_mvn = os.path.dirname(cmvn_file) + "/am.mvn"
116
+ with open(am_mvn, "w") as fout:
117
+ fout.write(
118
+ "<Nnet>"
119
+ + "\n"
120
+ + "<Splice> "
121
+ + str(dims)
122
+ + " "
123
+ + str(dims)
124
+ + "\n"
125
+ + "[ 0 ]"
126
+ + "\n"
127
+ + "<AddShift> "
128
+ + str(dims)
129
+ + " "
130
+ + str(dims)
131
+ + "\n"
132
+ )
133
+ mean_str = (
134
+ str(list(mean)).replace(",", "").replace("[", "[ ").replace("]", " ]")
135
+ )
136
+ fout.write("<LearnRateCoef> 0 " + mean_str + "\n")
137
+ fout.write("<Rescale> " + str(dims) + " " + str(dims) + "\n")
138
+ var_str = str(list(var)).replace(",", "").replace("[", "[ ").replace("]", " ]")
139
+ fout.write("<LearnRateCoef> 0 " + var_str + "\n")
140
+ fout.write("</Nnet>" + "\n")
141
+
142
+
143
+ """
144
+ python funasr/bin/compute_audio_cmvn.py \
145
+ --config-path "/Users/zhifu/funasr1.0/examples/aishell/paraformer/conf" \
146
+ --config-name "train_asr_paraformer_conformer_12e_6d_2048_256.yaml" \
147
+ ++train_data_set_list="/Users/zhifu/funasr1.0/data/list/audio_datasets.jsonl" \
148
+ ++cmvn_file="/Users/zhifu/funasr1.0/data/list/cmvn.json" \
149
+ ++dataset_conf.num_workers=0
150
+ """
151
+ if __name__ == "__main__":
152
+ main_hydra()
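A small sketch of how the statistics accumulated above translate into the shift/scale normalization written to am.mvn; the function and variable names here are illustrative, not part of the file:

```python
import numpy as np

def apply_cmvn(fbank, mean_stats, var_stats, total_frames):
    """Apply the per-dimension normalization that main() encodes in am.mvn."""
    mean = mean_stats / total_frames                              # E[x]
    scale = 1.0 / np.sqrt(var_stats / total_frames - mean ** 2)   # 1 / std
    return (fbank - mean) * scale                                 # zero mean, unit variance
```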
demo/Step-Audio-EditX/funasr_detach/bin/inference.py ADDED
@@ -0,0 +1,33 @@
1
+ import hydra
2
+ import logging
3
+ from omegaconf import DictConfig, OmegaConf, ListConfig
4
+
5
+ from funasr_detach.auto.auto_model import AutoModel
6
+
7
+
8
+ @hydra.main(config_name=None, version_base=None)
9
+ def main_hydra(cfg: DictConfig):
10
+ def to_plain_list(cfg_item):
11
+ if isinstance(cfg_item, ListConfig):
12
+ return OmegaConf.to_container(cfg_item, resolve=True)
13
+ elif isinstance(cfg_item, DictConfig):
14
+ return {k: to_plain_list(v) for k, v in cfg_item.items()}
15
+ else:
16
+ return cfg_item
17
+
18
+ kwargs = to_plain_list(cfg)
19
+ log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())
20
+
21
+ logging.basicConfig(level=log_level)
22
+
23
+ if kwargs.get("debug", False):
24
+ import pdb
25
+
26
+ pdb.set_trace()
27
+ model = AutoModel(**kwargs)
28
+ res = model.generate(input=kwargs["input"])
29
+ print(res)
30
+
31
+
32
+ if __name__ == "__main__":
33
+ main_hydra()
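An illustrative invocation of the hydra entry point above; the model name, input path, and device are placeholders:

```python
# Example command (placeholders only):
#   python funasr_detach/bin/inference.py ++model="paraformer-zh" ++input="example.wav" ++device="cuda"
#
# which is roughly equivalent to the programmatic call inside main_hydra():
#   model = AutoModel(model="paraformer-zh", device="cuda")
#   print(model.generate(input="example.wav"))
```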
demo/Step-Audio-EditX/funasr_detach/bin/tokenize_text.py ADDED
@@ -0,0 +1,281 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ from collections import Counter
4
+ import logging
5
+ from pathlib import Path
6
+ import sys
7
+ from typing import List
8
+ from typing import Optional
9
+
10
+
11
+ from funasr_detach.utils.cli_utils import get_commandline_args
12
+ from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
13
+ from funasr_detach.tokenizer.cleaner import TextCleaner
14
+ from funasr_detach.tokenizer.phoneme_tokenizer import g2p_classes
15
+ from funasr_detach.utils.types import str2bool
16
+ from funasr_detach.utils.types import str_or_none
17
+
18
+
19
+ def field2slice(field: Optional[str]) -> slice:
20
+ """Convert field string to slice
21
+
22
+ Note that field string accepts 1-based integer.
23
+
24
+ Examples:
25
+ >>> field2slice("1-")
26
+ slice(0, None, None)
27
+ >>> field2slice("1-3")
28
+ slice(0, 3, None)
29
+ >>> field2slice("-3")
30
+ slice(None, 3, None)
31
+ """
32
+ field = field.strip()
33
+ try:
34
+ if "-" in field:
35
+ # e.g. "2-" or "2-5" or "-7"
36
+ s1, s2 = field.split("-", maxsplit=1)
37
+ if s1.strip() == "":
38
+ s1 = None
39
+ else:
40
+ s1 = int(s1)
41
+ if s1 == 0:
42
+ raise ValueError("1-based string")
43
+ if s2.strip() == "":
44
+ s2 = None
45
+ else:
46
+ s2 = int(s2)
47
+ else:
48
+ # e.g. "2"
49
+ s1 = int(field)
50
+ s2 = s1 + 1
51
+ if s1 == 0:
52
+ raise ValueError("must be 1 or more value")
53
+ except ValueError:
54
+ raise RuntimeError(f"Format error: e.g. '2-', '2-5', or '-5': {field}")
55
+
56
+ if s1 is None:
57
+ slic = slice(None, s2)
58
+ else:
59
+ # -1 because of 1-based integer following "cut" command
60
+ # e.g "1-3" -> slice(0, 3)
61
+ slic = slice(s1 - 1, s2)
62
+ return slic
63
+
64
+
65
+ def tokenize(
66
+ input: str,
67
+ output: str,
68
+ field: Optional[str],
69
+ delimiter: Optional[str],
70
+ token_type: str,
71
+ space_symbol: str,
72
+ non_linguistic_symbols: Optional[str],
73
+ bpemodel: Optional[str],
74
+ log_level: str,
75
+ write_vocabulary: bool,
76
+ vocabulary_size: int,
77
+ remove_non_linguistic_symbols: bool,
78
+ cutoff: int,
79
+ add_symbol: List[str],
80
+ cleaner: Optional[str],
81
+ g2p: Optional[str],
82
+ ):
83
+
84
+ logging.basicConfig(
85
+ level=log_level,
86
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
87
+ )
88
+ if input == "-":
89
+ fin = sys.stdin
90
+ else:
91
+ fin = Path(input).open("r", encoding="utf-8")
92
+ if output == "-":
93
+ fout = sys.stdout
94
+ else:
95
+ p = Path(output)
96
+ p.parent.mkdir(parents=True, exist_ok=True)
97
+ fout = p.open("w", encoding="utf-8")
98
+
99
+ cleaner = TextCleaner(cleaner)
100
+ tokenizer = build_tokenizer(
101
+ token_type=token_type,
102
+ bpemodel=bpemodel,
103
+ delimiter=delimiter,
104
+ space_symbol=space_symbol,
105
+ non_linguistic_symbols=non_linguistic_symbols,
106
+ remove_non_linguistic_symbols=remove_non_linguistic_symbols,
107
+ g2p_type=g2p,
108
+ )
109
+
110
+ counter = Counter()
111
+ if field is not None:
112
+ field = field2slice(field)
113
+
114
+ for line in fin:
115
+ line = line.rstrip()
116
+ if field is not None:
117
+ # e.g. field="2-"
118
+ # uttidA hello world!! -> hello world!!
119
+ tokens = line.split(delimiter)
120
+ tokens = tokens[field]
121
+ if delimiter is None:
122
+ line = " ".join(tokens)
123
+ else:
124
+ line = delimiter.join(tokens)
125
+
126
+ line = cleaner(line)
127
+ tokens = tokenizer.text2tokens(line)
128
+ if not write_vocabulary:
129
+ fout.write(" ".join(tokens) + "\n")
130
+ else:
131
+ for t in tokens:
132
+ counter[t] += 1
133
+
134
+ if not write_vocabulary:
135
+ return
136
+
137
+ ## FIXME
138
+ ## del duplicate add_symbols in counter
139
+ for symbol_and_id in add_symbol:
140
+ # e.g symbol="<blank>:0"
141
+ try:
142
+ symbol, idx = symbol_and_id.split(":")
143
+ except ValueError:
144
+ raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
145
+ symbol = symbol.strip()
146
+ if symbol in counter:
147
+ del counter[symbol]
148
+
149
+ # ======= write_vocabulary mode from here =======
150
+ # Sort by the number of occurrences in descending order
151
+ # and filter lower frequency words than cutoff value
152
+ words_and_counts = list(
153
+ filter(lambda x: x[1] > cutoff, sorted(counter.items(), key=lambda x: -x[1]))
154
+ )
155
+ # Restrict the vocabulary size
156
+ if vocabulary_size > 0:
157
+ if vocabulary_size < len(add_symbol):
158
+ raise RuntimeError(f"vocabulary_size is too small: {vocabulary_size}")
159
+ words_and_counts = words_and_counts[: vocabulary_size - len(add_symbol)]
160
+
161
+ # Parse the values of --add_symbol
162
+ for symbol_and_id in add_symbol:
163
+ # e.g symbol="<blank>:0"
164
+ try:
165
+ symbol, idx = symbol_and_id.split(":")
166
+ idx = int(idx)
167
+ except ValueError:
168
+ raise RuntimeError(f"Format error: e.g. '<blank>:0': {symbol_and_id}")
169
+ symbol = symbol.strip()
170
+
171
+ # e.g. idx=0 -> append as the first symbol
172
+ # e.g. idx=-1 -> append as the last symbol
173
+ if idx < 0:
174
+ idx = len(words_and_counts) + 1 + idx
175
+ words_and_counts.insert(idx, (symbol, None))
176
+
177
+ # Write words
178
+ for w, c in words_and_counts:
179
+ fout.write(w + "\n")
180
+
181
+ # Logging
182
+ total_count = sum(counter.values())
183
+ invocab_count = sum(c for w, c in words_and_counts if c is not None)
184
+ logging.info(f"OOV rate = {(total_count - invocab_count) / total_count * 100} %")
185
+
186
+
187
+ def get_parser() -> argparse.ArgumentParser:
188
+ parser = argparse.ArgumentParser(
189
+ description="Tokenize texts",
190
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
191
+ )
192
+ parser.add_argument(
193
+ "--log_level",
194
+ type=lambda x: x.upper(),
195
+ default="INFO",
196
+ choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
197
+ help="The verbose level of logging",
198
+ )
199
+
200
+ parser.add_argument(
201
+ "--input", "-i", required=True, help="Input text. - indicates sys.stdin"
202
+ )
203
+ parser.add_argument(
204
+ "--output", "-o", required=True, help="Output text. - indicates sys.stdout"
205
+ )
206
+ parser.add_argument(
207
+ "--field",
208
+ "-f",
209
+ help="The target columns of the input text as 1-based integer. e.g 2-",
210
+ )
211
+ parser.add_argument(
212
+ "--token_type",
213
+ "-t",
214
+ default="char",
215
+ choices=["char", "bpe", "word", "phn"],
216
+ help="Token type",
217
+ )
218
+ parser.add_argument("--delimiter", "-d", default=None, help="The delimiter")
219
+ parser.add_argument("--space_symbol", default="<space>", help="The space symbol")
220
+ parser.add_argument("--bpemodel", default=None, help="The bpemodel file path")
221
+ parser.add_argument(
222
+ "--non_linguistic_symbols",
223
+ type=str_or_none,
224
+ help="non_linguistic_symbols file path",
225
+ )
226
+ parser.add_argument(
227
+ "--remove_non_linguistic_symbols",
228
+ type=str2bool,
229
+ default=False,
230
+ help="Remove non-language-symbols from tokens",
231
+ )
232
+ parser.add_argument(
233
+ "--cleaner",
234
+ type=str_or_none,
235
+ choices=[None, "tacotron", "jaconv", "vietnamese", "korean_cleaner"],
236
+ default=None,
237
+ help="Apply text cleaning",
238
+ )
239
+ parser.add_argument(
240
+ "--g2p",
241
+ type=str_or_none,
242
+ choices=g2p_classes,
243
+ default=None,
244
+ help="Specify g2p method if --token_type=phn",
245
+ )
246
+
247
+ group = parser.add_argument_group("write_vocabulary mode related")
248
+ group.add_argument(
249
+ "--write_vocabulary",
250
+ type=str2bool,
251
+ default=False,
252
+ help="Write tokens list instead of tokenized text per line",
253
+ )
254
+ group.add_argument("--vocabulary_size", type=int, default=0, help="Vocabulary size")
255
+ group.add_argument(
256
+ "--cutoff",
257
+ default=0,
258
+ type=int,
259
+ help="cut-off frequency used for write-vocabulary mode",
260
+ )
261
+ group.add_argument(
262
+ "--add_symbol",
263
+ type=str,
264
+ default=[],
265
+ action="append",
266
+ help="Append symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1'",
267
+ )
268
+
269
+ return parser
270
+
271
+
272
+ def main(cmd=None):
273
+ print(get_commandline_args(), file=sys.stderr)
274
+ parser = get_parser()
275
+ args = parser.parse_args(cmd)
276
+ kwargs = vars(args)
277
+ tokenize(**kwargs)
278
+
279
+
280
+ if __name__ == "__main__":
281
+ main()
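Two illustrative invocations of the tokenizer CLI above (file paths are placeholders); the second shows the write_vocabulary mode:

```python
# Tokenize column 2 onwards of a kaldi-style "uttid transcript" file into characters:
#   python funasr_detach/bin/tokenize_text.py -i data/text -o data/tokens.txt -f 2- -t char
#
# Build a vocabulary instead, reserving <blank> and <unk> at fixed positions:
#   python funasr_detach/bin/tokenize_text.py -i data/text -o data/vocab.txt -f 2- -t char \
#       --write_vocabulary true --add_symbol "<blank>:0" --add_symbol "<unk>:1"
```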
demo/Step-Audio-EditX/funasr_detach/bin/train.py ADDED
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import torch
7
+ import hydra
8
+ import logging
9
+ import argparse
10
+ from io import BytesIO
11
+ import torch.distributed as dist
12
+ from collections.abc import Sequence
13
+ from omegaconf import DictConfig, OmegaConf
14
+ from torch.nn.parallel import DistributedDataParallel as DDP
15
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
16
+
17
+ from funasr_detach.register import tables
18
+ from funasr_detach.optimizers import optim_classes
19
+ from funasr_detach.train_utils.trainer import Trainer
20
+ from funasr_detach.schedulers import scheduler_classes
21
+ from funasr_detach.train_utils.initialize import initialize
22
+ from funasr_detach.download.download_from_hub import download_model
23
+ from funasr_detach.models.lora.utils import mark_only_lora_as_trainable
24
+ from funasr_detach.train_utils.set_all_random_seed import set_all_random_seed
25
+ from funasr_detach.train_utils.load_pretrained_model import load_pretrained_model
26
+
27
+ # from funasr_detach.tokenizer.build_tokenizer import build_tokenizer
28
+ # from funasr_detach.tokenizer.token_id_converter import TokenIDConverter
29
+ # from funasr_detach.tokenizer.funtoken import build_tokenizer
30
+
31
+
32
+ @hydra.main(config_name=None, version_base=None)
33
+ def main_hydra(kwargs: DictConfig):
34
+ if kwargs.get("debug", False):
35
+ import pdb
36
+
37
+ pdb.set_trace()
38
+
39
+ assert "model" in kwargs
40
+ if "model_conf" not in kwargs:
41
+ logging.info(
42
+ "download models from model hub: {}".format(kwargs.get("model_hub", "ms"))
43
+ )
44
+ kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)
45
+
46
+ main(**kwargs)
47
+
48
+
49
+ def main(**kwargs):
50
+ print(kwargs)
51
+
52
+ # set random seed
53
+ set_all_random_seed(kwargs.get("seed", 0))
54
+ torch.backends.cudnn.enabled = kwargs.get(
55
+ "cudnn_enabled", torch.backends.cudnn.enabled
56
+ )
57
+ torch.backends.cudnn.benchmark = kwargs.get(
58
+ "cudnn_benchmark", torch.backends.cudnn.benchmark
59
+ )
60
+ torch.backends.cudnn.deterministic = kwargs.get("cudnn_deterministic", True)
61
+
62
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
63
+ if local_rank == 0:
64
+ tables.print()
65
+ # Check if we are using DDP or FSDP
66
+ use_ddp = "WORLD_SIZE" in os.environ and int(os.environ["WORLD_SIZE"]) > 1
67
+ use_fsdp = kwargs.get("use_fsdp", None)
68
+ if use_ddp or use_fsdp:
69
+ dist.init_process_group(
70
+ backend=kwargs.get("backend", "nccl"), init_method="env://"
71
+ )
72
+ torch.cuda.set_device(local_rank)
73
+
74
+ # save config.yaml
75
+ if (
76
+ (use_ddp or use_fsdp)
77
+ and dist.get_rank() == 0
78
+ or not (use_ddp or use_fsdp)
79
+ and local_rank == 0
80
+ ):
81
+ os.makedirs(kwargs.get("output_dir", "./"), exist_ok=True)
82
+ yaml_file = os.path.join(kwargs.get("output_dir", "./"), "config.yaml")
83
+ OmegaConf.save(config=kwargs, f=yaml_file)
84
+ logging.info("config.yaml is saved to: %s", yaml_file)
85
+
86
+ tokenizer = kwargs.get("tokenizer", None)
87
+ if tokenizer is not None:
88
+ tokenizer_class = tables.tokenizer_classes.get(tokenizer)
89
+ tokenizer = tokenizer_class(**kwargs["tokenizer_conf"])
90
+ kwargs["tokenizer"] = tokenizer
91
+
92
+ # build frontend if frontend is not None
93
+ frontend = kwargs.get("frontend", None)
94
+ if frontend is not None:
95
+ frontend_class = tables.frontend_classes.get(frontend)
96
+ frontend = frontend_class(**kwargs["frontend_conf"])
97
+ kwargs["frontend"] = frontend
98
+ kwargs["input_size"] = frontend.output_size()
99
+
100
+ # build model
101
+ model_class = tables.model_classes.get(kwargs["model"])
102
+ model = model_class(
103
+ **kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list)
104
+ )
105
+
106
+ # init_param
107
+ init_param = kwargs.get("init_param", None)
108
+ if init_param is not None:
109
+ if not isinstance(init_param, (list, tuple)):
110
+ init_param = (init_param,)
111
+ logging.info("init_param is not None: %s", init_param)
112
+ for p in init_param:
113
+ logging.info(f"Loading pretrained params from {p}")
114
+ load_pretrained_model(
115
+ model=model,
116
+ path=p,
117
+ ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True),
118
+ oss_bucket=kwargs.get("oss_bucket", None),
119
+ scope_map=kwargs.get("scope_map", None),
120
+ excludes=kwargs.get("excludes", None),
121
+ )
122
+ else:
123
+ initialize(model, kwargs.get("init", "kaiming_normal"))
124
+
125
+ # freeze_param
126
+ freeze_param = kwargs.get("freeze_param", None)
127
+ if freeze_param is not None:
128
+ freeze_param = eval(freeze_param)
129
+ if not isinstance(freeze_param, (list, tuple)):  # wrap a single name so the loop below iterates over strings
130
+ freeze_param = (freeze_param,)
131
+ logging.info("freeze_param is not None: %s", freeze_param)
132
+ for t in freeze_param:
133
+ for k, p in model.named_parameters():
134
+ if k.startswith(t + ".") or k == t:
135
+ logging.info(f"Setting {k}.requires_grad = False")
136
+ p.requires_grad = False
137
+
138
+ if use_ddp:
139
+ model = model.cuda(local_rank)
140
+ model = DDP(
141
+ model,
142
+ device_ids=[local_rank],
143
+ find_unused_parameters=kwargs.get("train_conf", {}).get(
144
+ "find_unused_parameters", False
145
+ ),
146
+ )
147
+ elif use_fsdp:
148
+ model = FSDP(model).cuda(local_rank)
149
+ else:
150
+ model = model.to(device=kwargs.get("device", "cuda"))
151
+
152
+ # optim
153
+ optim = kwargs.get("optim", "adam")
154
+ assert optim in optim_classes
155
+ optim_class = optim_classes.get(optim)
156
+ optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))
157
+
158
+ # scheduler
159
+ scheduler = kwargs.get("scheduler", "warmuplr")
160
+ assert scheduler in scheduler_classes
161
+ scheduler_class = scheduler_classes.get(scheduler)
162
+ scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))
163
+
164
+ # dataset
165
+ dataset_class = tables.dataset_classes.get(kwargs.get("dataset", "AudioDataset"))
166
+ dataset_tr = dataset_class(
167
+ kwargs.get("train_data_set_list"),
168
+ frontend=frontend,
169
+ tokenizer=tokenizer,
170
+ is_training=True,
171
+ **kwargs.get("dataset_conf"),
172
+ )
173
+ dataset_val = dataset_class(
174
+ kwargs.get("valid_data_set_list"),
175
+ frontend=frontend,
176
+ tokenizer=tokenizer,
177
+ is_training=False,
178
+ **kwargs.get("dataset_conf"),
179
+ )
180
+
181
+ # dataloader
182
+ batch_sampler = kwargs["dataset_conf"].get(
183
+ "batch_sampler", "DynamicBatchLocalShuffleSampler"
184
+ )
185
+ batch_sampler_val = None
186
+ if batch_sampler is not None:
187
+ batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
188
+ batch_sampler = batch_sampler_class(dataset_tr, **kwargs.get("dataset_conf"))
189
+ batch_sampler_val = batch_sampler_class(
190
+ dataset_val, is_training=False, **kwargs.get("dataset_conf")
191
+ )
192
+ dataloader_tr = torch.utils.data.DataLoader(
193
+ dataset_tr,
194
+ collate_fn=dataset_tr.collator,
195
+ batch_sampler=batch_sampler,
196
+ num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
197
+ pin_memory=True,
198
+ )
199
+
200
+ dataloader_val = torch.utils.data.DataLoader(
201
+ dataset_val,
202
+ collate_fn=dataset_val.collator,
203
+ batch_sampler=batch_sampler_val,
204
+ num_workers=kwargs.get("dataset_conf").get("num_workers", 4),
205
+ pin_memory=True,
206
+ )
207
+ trainer = Trainer(
208
+ model=model,
209
+ optim=optim,
210
+ scheduler=scheduler,
211
+ dataloader_train=dataloader_tr,
212
+ dataloader_val=dataloader_val,
213
+ local_rank=local_rank,
214
+ use_ddp=use_ddp,
215
+ use_fsdp=use_fsdp,
216
+ output_dir=kwargs.get("output_dir", "./exp"),
217
+ resume=kwargs.get("resume", True),
218
+ **kwargs.get("train_conf"),
219
+ )
220
+ trainer.run()
221
+
222
+ if use_ddp or use_fsdp:
223
+ torch.distributed.destroy_process_group()
224
+
225
+
226
+ if __name__ == "__main__":
227
+ main_hydra()
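An illustrative multi-GPU launch of the training entry point above; the config name, data lists, GPU count, and output directory are all placeholders:

```python
# torchrun sets WORLD_SIZE / LOCAL_RANK, which main() uses to enable DDP:
#   torchrun --nproc_per_node=2 funasr_detach/bin/train.py \
#       --config-path conf --config-name train_asr.yaml \
#       ++train_data_set_list=data/train.jsonl \
#       ++valid_data_set_list=data/val.jsonl \
#       ++output_dir=./exp
```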
demo/Step-Audio-EditX/funasr_detach/datasets/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/datasets.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+
3
+ from funasr_detach.register import tables
4
+ from funasr_detach.utils.load_utils import extract_fbank, load_audio_text_image_video
5
+
6
+
7
+ @tables.register("dataset_classes", "AudioDataset")
8
+ class AudioDataset(torch.utils.data.Dataset):
9
+ """
10
+ AudioDataset
11
+ """
12
+
13
+ def __init__(
14
+ self,
15
+ path,
16
+ index_ds: str = None,
17
+ frontend=None,
18
+ tokenizer=None,
19
+ int_pad_value: int = -1,
20
+ float_pad_value: float = 0.0,
21
+ **kwargs
22
+ ):
23
+ super().__init__()
24
+ index_ds_class = tables.index_ds_classes.get(index_ds)
25
+ self.index_ds = index_ds_class(path, **kwargs)
26
+ preprocessor_speech = kwargs.get("preprocessor_speech", None)
27
+ if preprocessor_speech:
28
+ preprocessor_speech_class = tables.preprocessor_classes.get(
29
+ preprocessor_speech
30
+ )
31
+ preprocessor_speech = preprocessor_speech_class(
32
+ **kwargs.get("preprocessor_speech_conf")
33
+ )
34
+ self.preprocessor_speech = preprocessor_speech
35
+ preprocessor_text = kwargs.get("preprocessor_text", None)
36
+ if preprocessor_text:
37
+ preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text)
38
+ preprocessor_text = preprocessor_text_class(
39
+ **kwargs.get("preprocessor_text_conf")
40
+ )
41
+ self.preprocessor_text = preprocessor_text
42
+
43
+ self.frontend = frontend
44
+ self.fs = 16000 if frontend is None else frontend.fs
45
+ self.data_type = "sound"
46
+ self.tokenizer = tokenizer
47
+
48
+ self.int_pad_value = int_pad_value
49
+ self.float_pad_value = float_pad_value
50
+
51
+ def get_source_len(self, index):
52
+ item = self.index_ds[index]
53
+ return self.index_ds.get_source_len(item)
54
+
55
+ def get_target_len(self, index):
56
+ item = self.index_ds[index]
57
+ return self.index_ds.get_target_len(item)
58
+
59
+ def __len__(self):
60
+ return len(self.index_ds)
61
+
62
+ def __getitem__(self, index):
63
+ item = self.index_ds[index]
64
+ # import pdb;
65
+ # pdb.set_trace()
66
+ source = item["source"]
67
+ data_src = load_audio_text_image_video(source, fs=self.fs)
68
+ if self.preprocessor_speech:
69
+ data_src = self.preprocessor_speech(data_src, fs=self.fs)
70
+ speech, speech_lengths = extract_fbank(
71
+ data_src, data_type=self.data_type, frontend=self.frontend, is_final=True
72
+ ) # speech: [b, T, d]
73
+
74
+ target = item["target"]
75
+ if self.preprocessor_text:
76
+ target = self.preprocessor_text(target)
77
+ if self.tokenizer:
78
+ ids = self.tokenizer.encode(target)
79
+ text = torch.tensor(ids, dtype=torch.int64)
80
+ else:
81
+ ids = target
82
+ text = ids
83
+ ids_lengths = len(ids)
84
+ text_lengths = torch.tensor([ids_lengths], dtype=torch.int32)
85
+
86
+ return {
87
+ "speech": speech[0, :, :],
88
+ "speech_lengths": speech_lengths,
89
+ "text": text,
90
+ "text_lengths": text_lengths,
91
+ }
92
+
93
+ def collator(self, samples: list = None):
94
+ outputs = {}
95
+ for sample in samples:
96
+ for key in sample.keys():
97
+ if key not in outputs:
98
+ outputs[key] = []
99
+ outputs[key].append(sample[key])
100
+
101
+ for key, data_list in outputs.items():
102
+ if isinstance(data_list[0], torch.Tensor):
103
+ if data_list[0].dtype == torch.int64:
104
+
105
+ pad_value = self.int_pad_value
106
+ else:
107
+ pad_value = self.float_pad_value
108
+
109
+ outputs[key] = torch.nn.utils.rnn.pad_sequence(
110
+ data_list, batch_first=True, padding_value=pad_value
111
+ )
112
+ return outputs
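A minimal sketch of wiring the dataset above into a DataLoader via its collator; the jsonl path is a placeholder, and real frontend/tokenizer objects (built from the surrounding config) would normally replace the None stand-ins:

```python
import torch
from funasr_detach.datasets.audio_datasets.datasets import AudioDataset

frontend = None   # stand-in: a registered frontend (e.g. built from frontend_conf) is expected here
tokenizer = None  # stand-in: a registered tokenizer turns "target" text into ids

dataset = AudioDataset(
    "data/train.jsonl",          # placeholder jsonl path (see index_ds.py / scp2jsonl.py)
    index_ds="IndexDSJsonl",     # registered index dataset (see index_ds.py)
    frontend=frontend,
    tokenizer=tokenizer,
)
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    collate_fn=dataset.collator,  # pads speech/text tensors to the longest item in the batch
)
```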
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/index_ds.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import logging
5
+ import concurrent.futures
6
+ import librosa
7
+ import torch.distributed as dist
8
+
9
+ from funasr_detach.register import tables
10
+
11
+
12
+ @tables.register("index_ds_classes", "IndexDSJsonlRankSplit")
13
+ class IndexDSJsonlRankSplit(torch.utils.data.Dataset):
14
+
15
+ def __init__(self, path):
16
+ super().__init__()
17
+
18
+ contents = []
19
+ with open(path, encoding="utf-8") as fin:
20
+ for line in fin:
21
+ data = json.loads(line.strip())
22
+ if "text" in data: # for sft
23
+ contents.append(data["text"])  # accumulate locally; self.contents is built below
24
+ if "source" in data: # for speech lab pretrain
25
+ prompt = data["prompt"]
26
+ source = data["source"]
27
+ target = data["target"]
28
+ source_len = data["source_len"]
29
+ target_len = data["target_len"]
30
+
31
+ contents.append(
32
+ {
33
+ "source": source,
34
+ "prompt": prompt,
35
+ "target": target,
36
+ "source_len": source_len,
37
+ "target_len": target_len,
38
+ }
39
+ )
40
+
41
+ self.contents = []
42
+ total_num = len(contents)
43
+ try:
44
+ rank = dist.get_rank()
45
+ world_size = dist.get_world_size()
46
+ except:
47
+ rank = 0
48
+ world_size = 1
49
+ logging.warning("distributed is not initialized, only single shard")
50
+ num_per_rank = total_num // world_size
51
+
52
+ # rank = 0
53
+ # import ipdb; ipdb.set_trace()
54
+ self.contents = contents[rank * num_per_rank : (rank + 1) * num_per_rank]
55
+
56
+ logging.info(
57
+ "in rank: {}, num of samplers: {}, total_num of samplers across ranks: {}".format(
58
+ rank, len(self.contents), len(contents)
59
+ )
60
+ )
61
+
62
+ def __len__(self):
63
+ return len(self.contents)
64
+
65
+ def __getitem__(self, index):
66
+ try:
67
+ data = self.contents[index]
68
+ except IndexError:
69
+ print(index)
+ raise
70
+ return data
71
+
72
+ def get_source_len(self, data_dict):
73
+ return data_dict["source_len"]
74
+
75
+ def get_target_len(self, data_dict):
76
+
77
+ return data_dict["target_len"] if "target_len" in data_dict else 0
78
+
79
+
80
+ @tables.register("index_ds_classes", "IndexDSJsonl")
81
+ @tables.register("index_ds_classes", "IndexDSJsonlRankFull")
82
+ class IndexDSJsonlRankFull(torch.utils.data.Dataset):
83
+
84
+ def __init__(self, path: str, **kwargs):
85
+ super().__init__()
86
+
87
+ if isinstance(path, (list, tuple)): # wav.scp, text.txt/text.trans
88
+ from funasr_detach.datasets.audio_datasets.scp2jsonl import (
89
+ gen_jsonl_from_wav_text_list,
90
+ )
91
+
92
+ jsonl_outdir = os.path.dirname(path[0])
93
+ jsonl_name = (
94
+ "datalist_train.jsonl"
95
+ if kwargs.get("is_training", True)
96
+ else "datalist_val.jsonl"
97
+ )
98
+ jsonl_file_out = os.path.join(jsonl_outdir, jsonl_name)
99
+ if not os.path.exists(jsonl_file_out):
100
+ print(f"datalist is: {path}, generate jsonl from it")
101
+ gen_jsonl_from_wav_text_list(
102
+ path, jsonl_file_out=jsonl_file_out, **kwargs
103
+ )
104
+ path = jsonl_file_out
105
+
106
+ contents = []
107
+ with open(path, encoding="utf-8") as fin:
108
+ for line in fin:
109
+ data = json.loads(line.strip())
110
+ if "text" in data: # for sft
111
+ contents.append(data["text"])  # accumulate locally; self.contents is assigned below
112
+ if "source" in data: # for speech lab pretrain
113
+ prompt = data.get("prompt", "<ASR>")
114
+ source = data["source"]
115
+ target = data["target"]
116
+ source_len = data.get("source_len", 1)
117
+ target_len = data.get("target_len", 0)
118
+
119
+ contents.append(
120
+ {
121
+ "source": source,
122
+ "prompt": prompt,
123
+ "target": target,
124
+ "source_len": source_len,
125
+ "target_len": target_len,
126
+ }
127
+ )
128
+
129
+ self.contents = contents
130
+
131
+ logging.info(
132
+ "total_num of samplers across ranks: {}".format(len(self.contents))
133
+ )
134
+
135
+ def __len__(self):
136
+ return len(self.contents)
137
+
138
+ def __getitem__(self, index):
139
+ try:
140
+ data = self.contents[index]
141
+ except IndexError:
142
+ print(index)
+ raise
143
+ return data
144
+
145
+ def get_source_len(self, data_dict):
146
+ return data_dict.get("source_len", 1)
147
+
148
+ def get_target_len(self, data_dict):
149
+
150
+ return data_dict.get("target_len", 0)
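A sketch of the jsonl record consumed by the index datasets above; the values are illustrative, and IndexDSJsonlRankFull only strictly needs "source" and "target", falling back to defaults for the other fields:

```python
import json

record = {
    "source": "/data/wav/utt_0001.wav",  # audio path (any input load_audio_text_image_video accepts)
    "target": "hello world",             # transcript
    "source_len": 560,                   # lengths consumed by the batch samplers
    "target_len": 2,
    "prompt": "<ASR>",
}
with open("data/train.jsonl", "a", encoding="utf-8") as fout:
    fout.write(json.dumps(record, ensure_ascii=False) + "\n")
```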
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/preprocessor.py ADDED
@@ -0,0 +1,55 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import logging
5
+ import concurrent.futures
6
+ import librosa
7
+ import torch.distributed as dist
8
+ from typing import Collection
9
+ import torch
10
+ import torchaudio
11
+ from torch import nn
12
+ import random
13
+ import re
14
+ from funasr_detach.tokenizer.cleaner import TextCleaner
15
+ from funasr_detach.register import tables
16
+
17
+
18
+ @tables.register("preprocessor_classes", "SpeechPreprocessSpeedPerturb")
19
+ class SpeechPreprocessSpeedPerturb(nn.Module):
20
+ def __init__(self, speed_perturb: list = None, **kwargs):
21
+ super().__init__()
22
+ self.speed_perturb = speed_perturb
23
+
24
+ def forward(self, waveform, fs, **kwargs):
25
+ if self.speed_perturb is None:
26
+ return waveform
27
+ speed = random.choice(self.speed_perturb)
28
+ if speed != 1.0:
29
+ if not isinstance(waveform, torch.Tensor):
30
+ waveform = torch.tensor(waveform)
31
+ waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
32
+ waveform.view(1, -1), fs, [["speed", str(speed)], ["rate", str(fs)]]
33
+ )
34
+ waveform = waveform.view(-1)
35
+
36
+ return waveform
37
+
38
+
39
+ @tables.register("preprocessor_classes", "TextPreprocessSegDict")
40
+ class TextPreprocessSegDict(nn.Module):
41
+ def __init__(
42
+ self,
43
+ seg_dict: str = None,
44
+ text_cleaner: Collection[str] = None,
45
+ split_with_space: bool = False,
46
+ **kwargs
47
+ ):
48
+ super().__init__()
49
+
50
+ self.text_cleaner = TextCleaner(text_cleaner)
51
+
52
+ def forward(self, text, **kwargs):
53
+ text = self.text_cleaner(text)
54
+
55
+ return text
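A minimal sketch of the speed-perturbation preprocessor above on synthetic audio; the perturbation factors are common choices rather than values fixed by this file, and torchaudio's sox backend must be available:

```python
import torch
from funasr_detach.datasets.audio_datasets.preprocessor import SpeechPreprocessSpeedPerturb

perturb = SpeechPreprocessSpeedPerturb(speed_perturb=[0.9, 1.0, 1.1])
waveform = torch.randn(16000)      # one second of fake 16 kHz audio
out = perturb(waveform, fs=16000)  # randomly ~0.9x / 1.0x / 1.1x the original duration
print(out.shape)
```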
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/samplers.py ADDED
@@ -0,0 +1,306 @@
1
+ import torch
2
+ import numpy as np
3
+ import logging
4
+ import torch.distributed as dist
5
+
6
+ from funasr_detach.register import tables
7
+
8
+
9
+ @tables.register("batch_sampler_classes", "DynamicBatchLocalShuffleSampler")
10
+ class BatchSampler(torch.utils.data.BatchSampler):
11
+
12
+ def __init__(
13
+ self,
14
+ dataset,
15
+ batch_type: str = "example",
16
+ batch_size: int = 100,
17
+ buffer_size: int = 30,
18
+ drop_last: bool = False,
19
+ shuffle: bool = True,
20
+ is_training: bool = True,
21
+ **kwargs
22
+ ):
23
+
24
+ self.drop_last = drop_last
25
+ self.pre_idx = -1
26
+ self.dataset = dataset
27
+ self.total_samples = len(dataset)
28
+ self.batch_type = batch_type
29
+ self.batch_size = int(batch_size)
30
+ self.buffer_size = buffer_size
31
+ self.max_token_length = kwargs.get("max_token_length", 5000)
32
+ self.shuffle_idx = np.arange(self.total_samples)
33
+ self.shuffle = shuffle and is_training
34
+ self.length_scale_source = kwargs.get("length_scale_source", 1.0)
35
+
36
+ def __len__(self):
37
+ return (self.total_samples - 1) // self.batch_size + 1
38
+
39
+ def set_epoch(self, epoch):
40
+ np.random.seed(epoch)
41
+
42
+ def __iter__(self):
43
+
44
+ if self.shuffle:
45
+ np.random.shuffle(self.shuffle_idx)
46
+
47
+ batch = []
48
+ max_token = 0
49
+ num_sample = 0
50
+
51
+ iter_num = (self.total_samples - 1) // self.buffer_size + 1
52
+ # print("iter_num: ", iter_num)
53
+ for iter in range(self.pre_idx + 1, iter_num):
54
+ datalen_with_index = []
55
+ for i in range(self.buffer_size):
56
+ idx = iter * self.buffer_size + i
57
+ if idx >= self.total_samples:
58
+ continue
59
+
60
+ idx_map = self.shuffle_idx[idx]
61
+ # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]
62
+ target_len = (
63
+ self.dataset.get_target_len(idx_map)
64
+ if self.batch_type == "length"
65
+ else 0.0
66
+ )
67
+ source_len = (
68
+ self.dataset.get_source_len(idx_map) / self.length_scale_source
69
+ )
70
+ sample_len_cur = source_len + target_len
71
+
72
+ datalen_with_index.append([idx, sample_len_cur])
73
+
74
+ datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
75
+ for item in datalen_with_index_sort:
76
+ idx, sample_len_cur_raw = item
77
+ if sample_len_cur_raw > self.max_token_length:
78
+ continue
79
+
80
+ max_token_cur = max(max_token, sample_len_cur_raw)
81
+ max_token_padding = 1 + num_sample
82
+ if self.batch_type != "example":
83
+ max_token_padding *= max_token_cur
84
+ if max_token_padding <= self.batch_size:
85
+ batch.append(idx)
86
+ max_token = max_token_cur
87
+ num_sample += 1
88
+ else:
89
+ yield batch
90
+ batch = [idx]
91
+ max_token = sample_len_cur_raw
92
+ num_sample = 1
93
+
94
+
95
+ @tables.register("batch_sampler_classes", "BatchSampler")
96
+ @tables.register("batch_sampler_classes", "RankFullLocalShuffleBatchSampler")
97
+ class RankFullLocalShuffleBatchSampler(torch.utils.data.BatchSampler):
98
+
99
+ def __init__(
100
+ self,
101
+ dataset,
102
+ batch_type: str = "example",
103
+ batch_size: int = 100,
104
+ buffer_size: int = 30,
105
+ drop_last: bool = True,
106
+ shuffle: bool = True,
107
+ is_training: bool = True,
108
+ **kwargs
109
+ ):
110
+
111
+ self.drop_last = drop_last
112
+ self.pre_idx = -1
113
+ self.dataset = dataset
114
+ self.total_samples = len(dataset)
115
+ self.batch_type = batch_type
116
+ self.batch_size = int(batch_size)
117
+ self.buffer_size = buffer_size
118
+ self.max_token_length = kwargs.get("max_token_length", 1500)
119
+ self.shuffle_idx = np.arange(self.total_samples)
120
+ self.shuffle = shuffle and is_training
121
+ self.length_scale_source = kwargs.get("length_scale_source", 1.0)
122
+
123
+ try:
124
+ rank = dist.get_rank()
125
+ world_size = dist.get_world_size()
126
+ except:
127
+ rank = 0
128
+ world_size = 1
129
+ self.rank = rank
130
+ self.world_size = world_size
131
+
132
+ def __len__(self):
133
+ return (self.total_samples - 1) // (self.batch_size * self.world_size) + 1
134
+
135
+ def set_epoch(self, epoch):
136
+ np.random.seed(epoch)
137
+
138
+ def __iter__(self):
139
+
140
+ batch_size_total = self.batch_size * self.world_size
141
+
142
+ if self.shuffle:
143
+ np.random.shuffle(self.shuffle_idx)
144
+
145
+ batch = []
146
+ max_token = 0
147
+ num_sample = 0
148
+
149
+ iter_num = (self.total_samples - 1) // self.buffer_size + 1
150
+ # print("iter_num: ", iter_num)
151
+ for iter in range(self.pre_idx + 1, iter_num):
152
+ # if iter == iter_num -1 and self.drop_last:
153
+ # continue
154
+ datalen_with_index = []
155
+ for i in range(self.buffer_size):
156
+ idx = iter * self.buffer_size + i
157
+ if idx >= self.total_samples:
158
+ continue
159
+
160
+ idx_map = self.shuffle_idx[idx]
161
+ # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]
162
+
163
+ source_len = (
164
+ self.dataset.get_source_len(idx_map) / self.length_scale_source
165
+ )
166
+ target_len = (
167
+ self.dataset.get_target_len(idx_map)
168
+ if self.batch_type == "length"
169
+ else 0.0
170
+ )
171
+ sample_len_cur = source_len + target_len
172
+
173
+ datalen_with_index.append([idx, sample_len_cur])
174
+
175
+ datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
176
+ for item in datalen_with_index_sort:
177
+ idx, sample_len_cur_raw = item
178
+ if sample_len_cur_raw > self.max_token_length:
179
+ continue
180
+
181
+ max_token_cur = max(max_token, sample_len_cur_raw)
182
+ max_token_padding = 1 + num_sample
183
+ # if self.batch_type != 'example':
184
+ # max_token_padding *= max_token_cur
185
+ if max_token_padding <= batch_size_total:
186
+ batch.append(idx)
187
+ max_token = max_token_cur
188
+ num_sample += 1
189
+ else:
190
+ batch_rank = batch[
191
+ self.rank * self.batch_size : (self.rank + 1) * self.batch_size
192
+ ]
193
+ yield batch_rank
194
+ batch = [idx]
195
+ max_token = sample_len_cur_raw
196
+ num_sample = 1
197
+
198
+
199
+ @tables.register("batch_sampler_classes", "RankFullLocalShuffleDynamicBatchSampler")
200
+ class RankFullLocalShuffleDynamicBatchSampler(torch.utils.data.BatchSampler):
201
+
202
+ def __init__(
203
+ self,
204
+ dataset,
205
+ batch_type: str = "example",
206
+ batch_size: int = 100,
207
+ buffer_size: int = 30,
208
+ drop_last: bool = True,
209
+ shuffle: bool = True,
210
+ is_training: bool = True,
211
+ **kwargs
212
+ ):
213
+
214
+ self.drop_last = drop_last
215
+ self.pre_idx = -1
216
+ self.dataset = dataset
217
+ self.total_samples = len(dataset)
218
+ self.batch_type = batch_type
219
+ self.batch_size = int(batch_size)
220
+ self.buffer_size = buffer_size
221
+ self.max_token_length = kwargs.get("max_token_length", 1500)
222
+ self.shuffle_idx = np.arange(self.total_samples)
223
+ self.shuffle = shuffle and is_training
224
+ self.length_scale_source = kwargs.get("length_scale_source", 1.0)
225
+
226
+ try:
227
+ rank = dist.get_rank()
228
+ world_size = dist.get_world_size()
229
+ except:
230
+ rank = 0
231
+ world_size = 1
232
+ self.rank = rank
233
+ self.world_size = world_size
234
+
235
+ def __len__(self):
236
+ return (self.total_samples - 1) // (self.batch_size * self.world_size) + 1
237
+
238
+ def set_epoch(self, epoch):
239
+ np.random.seed(epoch)
240
+
241
+ def __iter__(self):
242
+
243
+ batch_size_total = self.batch_size * self.world_size
244
+ if self.shuffle:
245
+ np.random.shuffle(self.shuffle_idx)
246
+
247
+ batch_list_all_rank = []
248
+ batch_list_cur = []
249
+ max_token = 0
250
+ num_sample = 0
251
+
252
+ iter_num = (self.total_samples - 1) // self.buffer_size + 1
253
+ # print("iter_num: ", iter_num)
254
+ for iter in range(self.pre_idx + 1, iter_num):
255
+ # if iter == iter_num - 1 and self.drop_last:
256
+ # continue
257
+ datalen_with_index = []
258
+ for i in range(self.buffer_size):
259
+ idx = iter * self.buffer_size + i
260
+ if idx >= self.total_samples:
261
+ continue
262
+
263
+ idx_map = self.shuffle_idx[idx]
264
+ # prompt = self.dataset.indexed_dataset[idx_map]["prompt"]
265
+
266
+ source_len = (
267
+ self.dataset.get_source_len(idx_map) / self.length_scale_source
268
+ )
269
+ target_len = (
270
+ self.dataset.get_target_len(idx_map)
271
+ if self.batch_type == "length"
272
+ else 0.0
273
+ )
274
+ sample_len_cur = source_len + target_len
275
+
276
+ datalen_with_index.append([idx, sample_len_cur])
277
+
278
+ datalen_with_index_sort = sorted(datalen_with_index, key=lambda x: x[1])
279
+ for ii, item in enumerate(datalen_with_index_sort):
280
+ is_last_batch = iter == iter_num - 1 and ii == len(
281
+ datalen_with_index_sort
282
+ )
283
+ idx, sample_len_cur_raw = item
284
+ if sample_len_cur_raw > self.max_token_length:
285
+ continue
286
+
287
+ max_token_cur = max(max_token, sample_len_cur_raw)
288
+ max_token_padding = 1 + num_sample
289
+
290
+ if self.batch_type != "example":
291
+ max_token_padding *= max_token_cur
292
+ if len(batch_list_all_rank) < self.world_size:
293
+
294
+ if max_token_padding <= self.batch_size:
295
+ batch_list_cur.append(idx)
296
+ max_token = max_token_cur
297
+ num_sample += 1
298
+ else:
299
+ batch_list_all_rank.append(batch_list_cur)
300
+ batch_list_cur = []
301
+ else:
302
+ batch_rank = batch_list_all_rank[self.rank]
303
+ yield batch_rank
304
+ batch_list_all_rank = [idx]
305
+ max_token = sample_len_cur_raw
306
+ num_sample = 1
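The samplers above pack variable-length utterances into batches under a token budget instead of a fixed example count. A minimal, hedged sketch of how such a sampler is typically attached to a torch DataLoader follows; the toy dataset, its lengths, and the collate function are illustrative only, and the import path simply mirrors the file above (a usage sketch, not part of the shipped demo):

import torch
from torch.utils.data import DataLoader, Dataset

# Assumption: funasr_detach is importable; BatchSampler is the class registered
# above as "DynamicBatchLocalShuffleSampler".
from funasr_detach.datasets.audio_datasets.samplers import BatchSampler


class ToyLengthDataset(Dataset):
    """Minimal dataset exposing the length hooks the sampler calls."""

    def __init__(self, lengths):
        self.lengths = lengths

    def __len__(self):
        return len(self.lengths)

    def __getitem__(self, idx):
        return torch.zeros(self.lengths[idx])

    def get_source_len(self, idx):
        return self.lengths[idx]

    def get_target_len(self, idx):
        return 0


dataset = ToyLengthDataset([120, 80, 400, 950, 60, 300])
# With batch_type="length", batch_size acts as a padded-token budget.
sampler = BatchSampler(dataset, batch_type="length", batch_size=1000, buffer_size=4)
sampler.set_epoch(0)
loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda items: items)
for batch in loader:
    print(len(batch), [x.numel() for x in batch])

Each yielded batch keeps len(batch) * longest_sample_len at or below batch_size, which is the padding-aware bound the check in __iter__ enforces.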
demo/Step-Audio-EditX/funasr_detach/datasets/audio_datasets/scp2jsonl.py ADDED
@@ -0,0 +1,116 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import logging
5
+ import hydra
6
+ from omegaconf import DictConfig, OmegaConf
7
+ import concurrent.futures
8
+ import librosa
9
+ import torch.distributed as dist
10
+
11
+
12
+ def gen_jsonl_from_wav_text_list(
13
+ path, data_type_list=("source", "target"), jsonl_file_out: str = None, **kwargs
14
+ ):
15
+ try:
16
+ rank = dist.get_rank()
17
+ world_size = dist.get_world_size()
18
+ except:
19
+ rank = 0
20
+ world_size = 1
21
+
22
+ cpu_cores = os.cpu_count() or 1
23
+ print(f"convert wav.scp text to jsonl, ncpu: {cpu_cores}")
24
+ if rank == 0:
25
+ json_dict = {}
26
+ for data_type, data_file in zip(data_type_list, path):
27
+ json_dict[data_type] = {}
28
+ with open(data_file, "r") as f:
29
+
30
+ data_file_lists = f.readlines()
31
+ lines_for_each_th = (len(data_file_lists) - 1) // cpu_cores + 1
32
+ task_num = cpu_cores if len(data_file_lists) > cpu_cores else 1
33
+ with concurrent.futures.ThreadPoolExecutor(
34
+ max_workers=cpu_cores
35
+ ) as executor:
36
+
37
+ futures = [
38
+ executor.submit(
39
+ parse_context_length,
40
+ data_file_lists[
41
+ i * lines_for_each_th : (i + 1) * lines_for_each_th
42
+ ],
43
+ data_type,
44
+ )
45
+ for i in range(task_num)
46
+ ]
47
+
48
+ for future in concurrent.futures.as_completed(futures):
49
+
50
+ json_dict[data_type].update(future.result())
51
+ # print(json_dict)
52
+
53
+ with open(jsonl_file_out, "w") as f:
54
+ for key in json_dict[data_type_list[0]].keys():
55
+ jsonl_line = {"key": key}
56
+ for data_file in data_type_list:
57
+ jsonl_line.update(json_dict[data_file][key])
58
+ jsonl_line = json.dumps(jsonl_line, ensure_ascii=False)
59
+ f.write(jsonl_line + "\n")
60
+ f.flush()
61
+
62
+ else:
63
+ pass
64
+
65
+ if world_size > 1:
66
+ dist.barrier()
67
+
68
+
69
+ def parse_context_length(data_list: list, data_type: str):
70
+
71
+ res = {}
72
+ for i, line in enumerate(data_list):
73
+ key, line = line.strip().split(maxsplit=1)
74
+ line = line.strip()
75
+ if os.path.exists(line):
76
+ waveform, _ = librosa.load(line, sr=16000)
77
+ sample_num = len(waveform)
78
+ context_len = int(sample_num // 16000 * 1000 / 10)
79
+ else:
80
+ context_len = len(line.split()) if " " in line else len(line)
81
+ res[key] = {data_type: line, f"{data_type}_len": context_len}
82
+ return res
83
+
84
+
85
+ @hydra.main(config_name=None, version_base=None)
86
+ def main_hydra(cfg: DictConfig):
87
+
88
+ kwargs = OmegaConf.to_container(cfg, resolve=True)
89
+
90
+ scp_file_list = kwargs.get(
91
+ "scp_file_list",
92
+ (
93
+ "/Users/zhifu/funasr1.0/test_local/wav.scp",
94
+ "/Users/zhifu/funasr1.0/test_local/text.txt",
95
+ ),
96
+ )
97
+ if isinstance(scp_file_list, str):
98
+ scp_file_list = eval(scp_file_list)
99
+ data_type_list = kwargs.get("data_type_list", ("source", "target"))
100
+ jsonl_file_out = kwargs.get(
101
+ "jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl"
102
+ )
103
+ gen_jsonl_from_wav_text_list(
104
+ scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out
105
+ )
106
+
107
+
108
+ """
109
+ python -m funasr_detach.datasets.audio_datasets.scp2jsonl \
110
+ ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
111
+ ++data_type_list='["source", "target"]' \
112
+ ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
113
+ """
114
+
115
+ if __name__ == "__main__":
116
+ main_hydra()
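For reference, a hedged sketch of the expected inputs and output of gen_jsonl_from_wav_text_list; the file names, utterance key, and transcript are placeholders, and the module's own dependencies (librosa, hydra) must be installed:

import json
from funasr_detach.datasets.audio_datasets.scp2jsonl import gen_jsonl_from_wav_text_list

# wav.scp holds "<key> <audio-path>" per line; text.txt holds "<key> <transcript>".
with open("wav.scp", "w") as f:
    f.write("utt001 /data/audio/utt001.wav\n")
with open("text.txt", "w") as f:
    f.write("utt001 hello world\n")

gen_jsonl_from_wav_text_list(
    ("wav.scp", "text.txt"),
    data_type_list=("source", "target"),
    jsonl_file_out="audio_datasets.jsonl",
)

# Each jsonl line pairs the key with a source/target entry plus a length field:
# source_len is the audio duration in 10 ms frames when the wav exists,
# otherwise a text-based fallback; target_len counts whitespace tokens here.
with open("audio_datasets.jsonl") as f:
    print(json.loads(f.readline()))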
demo/Step-Audio-EditX/funasr_detach/download/__init__.py ADDED
File without changes
demo/Step-Audio-EditX/funasr_detach/download/download_dataset_from_hub.py ADDED
@@ -0,0 +1,19 @@
1
+ def download_dataset():
2
+ pass
3
+
4
+
5
+ def download_dataset_from_ms(**kwargs):
6
+ from modelscope.msdatasets import MsDataset
7
+
8
+ dataset_name = kwargs.get(
9
+ "dataset_name", "speech_asr/speech_asr_aishell1_trainsets"
10
+ )
11
+ subset_name = kwargs.get("subset_name", "default")
12
+ split = kwargs.get("split", "train")
13
+ data_dump_dir = kwargs.get("data_dump_dir", None)
14
+ ds = MsDataset.load(
15
+ dataset_name=dataset_name,
16
+ subset_name=subset_name,
17
+ split=split,
18
+ cache_dir=data_dump_dir,
19
+ )
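A minimal call sketch for the ModelScope path (the dataset id is the helper's default, the dump directory is a placeholder, and the modelscope package is required):

from funasr_detach.download.download_dataset_from_hub import download_dataset_from_ms

download_dataset_from_ms(
    dataset_name="speech_asr/speech_asr_aishell1_trainsets",
    subset_name="default",
    split="train",
    data_dump_dir="./data_cache",  # placeholder local cache directory
)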
demo/Step-Audio-EditX/funasr_detach/download/download_from_hub.py ADDED
@@ -0,0 +1,231 @@
1
+ import os
2
+ import json
3
+ import threading
4
+ from omegaconf import OmegaConf
5
+
6
+ from funasr_detach.download.name_maps_from_hub import name_maps_ms, name_maps_hf
7
+
8
+ # Global cache for downloaded models to avoid repeated downloads
9
+ # Key: (repo_id, model_revision, model_hub)
10
+ # Value: repo_cache_dir
11
+ _model_cache = {}
12
+ _cache_lock = threading.Lock()
13
+
14
+
15
+ def download_model(**kwargs):
16
+ model_hub = kwargs.get("model_hub", "ms")
17
+ model_or_path = kwargs.get("model")
18
+ repo_path = kwargs.get("repo_path", "")
19
+
20
+ # Handle name mapping based on model_hub
21
+ if model_hub == "ms" and model_or_path in name_maps_ms:
22
+ model_or_path = name_maps_ms[model_or_path]
23
+ elif model_hub == "hf" and model_or_path in name_maps_hf:
24
+ model_or_path = name_maps_hf[model_or_path]
25
+
26
+ model_revision = kwargs.get("model_revision")
27
+
28
+ # Download model if it doesn't exist locally
29
+ if not os.path.exists(model_or_path):
30
+ if model_hub == "local":
31
+ # For local models, the path should already exist
32
+ raise FileNotFoundError(f"Local model path does not exist: {model_or_path}")
33
+ elif model_hub in ["ms", "hf"]:
34
+ repo_path, model_or_path = get_or_download_model_dir(
35
+ model_or_path,
36
+ model_revision,
37
+ is_training=kwargs.get("is_training"),
38
+ check_latest=kwargs.get("check_latest", True),
39
+ model_hub=model_hub,
40
+ )
41
+ else:
42
+ raise ValueError(f"Unsupported model_hub: {model_hub}")
43
+
44
+ print(f"Using model path: {model_or_path}")
45
+ kwargs["model_path"] = model_or_path
46
+ kwargs["repo_path"] = repo_path
47
+
48
+ # Common logic for processing configuration files (same for all model hubs)
49
+ if os.path.exists(os.path.join(model_or_path, "configuration.json")):
50
+ with open(
51
+ os.path.join(model_or_path, "configuration.json"), "r", encoding="utf-8"
52
+ ) as f:
53
+ conf_json = json.load(f)
54
+ cfg = {}
55
+ add_file_root_path(model_or_path, conf_json["file_path_metas"], cfg)
56
+ cfg.update(kwargs)
57
+ config = OmegaConf.load(cfg["config"])
58
+ kwargs = OmegaConf.merge(config, cfg)
59
+ kwargs["model"] = config["model"]
60
+ elif os.path.exists(os.path.join(model_or_path, "config.yaml")) and os.path.exists(
61
+ os.path.join(model_or_path, "model.pt")
62
+ ):
63
+ config = OmegaConf.load(os.path.join(model_or_path, "config.yaml"))
64
+ kwargs = OmegaConf.merge(config, kwargs)
65
+ init_param = os.path.join(model_or_path, "model.pt")
66
+ kwargs["init_param"] = init_param
67
+ if os.path.exists(os.path.join(model_or_path, "tokens.txt")):
68
+ kwargs["tokenizer_conf"]["token_list"] = os.path.join(
69
+ model_or_path, "tokens.txt"
70
+ )
71
+ if os.path.exists(os.path.join(model_or_path, "tokens.json")):
72
+ kwargs["tokenizer_conf"]["token_list"] = os.path.join(
73
+ model_or_path, "tokens.json"
74
+ )
75
+ if os.path.exists(os.path.join(model_or_path, "seg_dict")):
76
+ kwargs["tokenizer_conf"]["seg_dict"] = os.path.join(
77
+ model_or_path, "seg_dict"
78
+ )
79
+ if os.path.exists(os.path.join(model_or_path, "bpe.model")):
80
+ kwargs["tokenizer_conf"]["bpemodel"] = os.path.join(
81
+ model_or_path, "bpe.model"
82
+ )
83
+ kwargs["model"] = config["model"]
84
+ if os.path.exists(os.path.join(model_or_path, "am.mvn")):
85
+ kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
86
+ if os.path.exists(os.path.join(model_or_path, "jieba_usr_dict")):
87
+ kwargs["jieba_usr_dict"] = os.path.join(model_or_path, "jieba_usr_dict")
88
+
89
+ return OmegaConf.to_container(kwargs, resolve=True)
90
+
91
+
92
+ def add_file_root_path(model_or_path: str, file_path_metas: dict, cfg={}):
93
+
94
+ if isinstance(file_path_metas, dict):
95
+ for k, v in file_path_metas.items():
96
+ if isinstance(v, str):
97
+ p = os.path.join(model_or_path, v)
98
+ if os.path.exists(p):
99
+ cfg[k] = p
100
+ elif isinstance(v, dict):
101
+ if k not in cfg:
102
+ cfg[k] = {}
103
+ add_file_root_path(model_or_path, v, cfg[k])
104
+
105
+ return cfg
106
+
107
+
108
+ def get_or_download_model_dir(
109
+ model,
110
+ model_revision=None,
111
+ is_training=False,
112
+ check_latest=True,
113
+ model_hub="ms",
114
+ ):
115
+ """Get local model directory or download model if necessary.
116
+
117
+ Args:
118
+ model (str): model id or path to local model directory.
119
+ For HF subfolders, use format: "repo_id/subfolder_path"
120
+ model_revision (str, optional): model version number.
121
+ is_training (bool): Whether this is for training
122
+ check_latest (bool): Whether to check for latest version
123
+ model_hub (str): Model hub type ("ms" for ModelScope, "hf" for HuggingFace)
124
+ """
125
+ # Extract repo_id for caching (handle subfolder case)
126
+ if "/" in model and len(model.split("/")) > 2:
127
+ parts = model.split("/")
128
+ repo_id = "/".join(parts[:2]) # e.g., "organization/repo" or "stepfun-ai/Step-Audio-EditX"
129
+ subfolder = "/".join(parts[2:]) # e.g., "subfolder/model"
130
+ else:
131
+ repo_id = model
132
+ subfolder = None
133
+
134
+ # Create cache key
135
+ cache_key = (repo_id, model_revision, model_hub)
136
+
137
+ # Check cache first
138
+ with _cache_lock:
139
+ if cache_key in _model_cache:
140
+ cached_repo_dir = _model_cache[cache_key]
141
+ print(f"Using cached model for {repo_id}: {cached_repo_dir}")
142
+
143
+ # For subfolder case, construct the model_cache_dir from cached repo
144
+ if subfolder:
145
+ model_cache_dir = os.path.join(cached_repo_dir, subfolder)
146
+ if not os.path.exists(model_cache_dir):
147
+ raise FileNotFoundError(f"Subfolder {subfolder} not found in cached repo {repo_id}")
148
+ else:
149
+ model_cache_dir = cached_repo_dir
150
+
151
+ return cached_repo_dir, model_cache_dir
152
+
153
+ # Cache miss, need to download
154
+ if model_hub == "ms":
155
+ # ModelScope download
156
+ from modelscope.hub.snapshot_download import snapshot_download
157
+ from modelscope.utils.constant import Invoke, ThirdParty
158
+
159
+ key = Invoke.LOCAL_TRAINER if is_training else Invoke.PIPELINE
160
+
161
+ # Download the repo (use repo_id, not the full model path with subfolder)
162
+ repo_cache_dir = snapshot_download(
163
+ repo_id,
164
+ revision=model_revision,
165
+ user_agent={Invoke.KEY: key, ThirdParty.KEY: "funasr"},
166
+ )
167
+ repo_cache_dir = normalize_cache_path(repo_cache_dir)
168
+
169
+ # Construct model_cache_dir
170
+ if subfolder:
171
+ model_cache_dir = os.path.join(repo_cache_dir, subfolder)
172
+ if not os.path.exists(model_cache_dir):
173
+ raise FileNotFoundError(f"Subfolder {subfolder} not found in downloaded repo {repo_id}")
174
+ else:
175
+ model_cache_dir = normalize_cache_path(repo_cache_dir)
176
+
177
+ elif model_hub == "hf":
178
+ # HuggingFace download
179
+ try:
180
+ from huggingface_hub import snapshot_download
181
+ except ImportError:
182
+ raise ImportError(
183
+ "huggingface_hub is required for downloading from HuggingFace. "
184
+ "Please install it with: pip install huggingface_hub"
185
+ )
186
+
187
+ # Download the repo (use repo_id, not the full model path with subfolder)
188
+ repo_cache_dir = snapshot_download(
189
+ repo_id=repo_id,
190
+ revision=model_revision,
191
+ allow_patterns=None, # Download all files to ensure resource files are available
192
+ )
193
+ repo_cache_dir = normalize_cache_path(repo_cache_dir)
194
+
195
+ # Construct model_cache_dir
196
+ if subfolder:
197
+ model_cache_dir = os.path.join(repo_cache_dir, subfolder)
198
+ if not os.path.exists(model_cache_dir):
199
+ raise FileNotFoundError(f"Subfolder {subfolder} not found in downloaded repo {repo_id}")
200
+ else:
201
+ model_cache_dir = normalize_cache_path(repo_cache_dir)
202
+ else:
203
+ raise ValueError(f"Unsupported model_hub: {model_hub}")
204
+
205
+ # Cache the result before returning
206
+ with _cache_lock:
207
+ _model_cache[cache_key] = repo_cache_dir
208
+
209
+ print(f"Model downloaded to: {model_cache_dir}")
210
+ return repo_cache_dir, model_cache_dir
211
+
212
+ def normalize_cache_path(cache_path):
213
+ """Normalize cache path to ensure consistent format with snapshots/{commit_id}."""
214
+ # Check if the cache_path directory contains a snapshots folder
215
+ snapshots_dir = os.path.join(cache_path, "snapshots")
216
+ if os.path.exists(snapshots_dir) and os.path.isdir(snapshots_dir):
217
+ # Find the commit_id subdirectory in snapshots
218
+ try:
219
+ snapshot_items = os.listdir(snapshots_dir)
220
+ # Look for the first directory (should be the commit_id)
221
+ for item in snapshot_items:
222
+ item_path = os.path.join(snapshots_dir, item)
223
+ if os.path.isdir(item_path):
224
+ # Found commit_id directory, return the full path
225
+ return os.path.join(cache_path, "snapshots", item)
226
+ except OSError:
227
+ pass
228
+
229
+ # If no snapshots directory found or error occurred, return original path
230
+ return cache_path
231
+
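A hedged sketch of resolving a model directory with the helper above; the repo id is the one mentioned in the code comments, the revision is left at its default, and a HuggingFace download needs huggingface_hub plus network access:

from funasr_detach.download.download_from_hub import get_or_download_model_dir

# The first call downloads (or reuses the local HF cache); later calls with the
# same (repo_id, revision, hub) tuple are served from the in-process _model_cache.
repo_dir, model_dir = get_or_download_model_dir(
    "stepfun-ai/Step-Audio-EditX", model_revision=None, model_hub="hf"
)
print(repo_dir, model_dir)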
demo/Step-Audio-EditX/funasr_detach/download/file.py ADDED
@@ -0,0 +1,335 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from abc import ABCMeta, abstractmethod
7
+ from pathlib import Path
8
+ from typing import Generator, Union
9
+
10
+ import requests
11
+ from urllib.parse import urlparse
12
+
13
+
14
+ def download_from_url(url):
15
+ result = urlparse(url)
16
+ file_path = None
17
+ if result.scheme is not None and len(result.scheme) > 0:
18
+ storage = HTTPStorage()
19
+ # bytes
20
+ data = storage.read(url)
21
+ work_dir = tempfile.TemporaryDirectory().name
22
+ if not os.path.exists(work_dir):
23
+ os.makedirs(work_dir)
24
+ file_path = os.path.join(work_dir, os.path.basename(url))
25
+ with open(file_path, "wb") as fb:
26
+ fb.write(data)
27
+ assert file_path is not None, f"failed to download: {url}"
28
+ return file_path
29
+
30
+
31
+ class Storage(metaclass=ABCMeta):
32
+ """Abstract class of storage.
33
+
34
+ All backends need to implement two apis: ``read()`` and ``read_text()``.
35
+ ``read()`` reads the file as a byte stream and ``read_text()`` reads
36
+ the file as texts.
37
+ """
38
+
39
+ @abstractmethod
40
+ def read(self, filepath: str):
41
+ pass
42
+
43
+ @abstractmethod
44
+ def read_text(self, filepath: str):
45
+ pass
46
+
47
+ @abstractmethod
48
+ def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
49
+ pass
50
+
51
+ @abstractmethod
52
+ def write_text(
53
+ self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
54
+ ) -> None:
55
+ pass
56
+
57
+
58
+ class LocalStorage(Storage):
59
+ """Local hard disk storage"""
60
+
61
+ def read(self, filepath: Union[str, Path]) -> bytes:
62
+ """Read data from a given ``filepath`` with 'rb' mode.
63
+
64
+ Args:
65
+ filepath (str or Path): Path to read data.
66
+
67
+ Returns:
68
+ bytes: Expected bytes object.
69
+ """
70
+ with open(filepath, "rb") as f:
71
+ content = f.read()
72
+ return content
73
+
74
+ def read_text(self, filepath: Union[str, Path], encoding: str = "utf-8") -> str:
75
+ """Read data from a given ``filepath`` with 'r' mode.
76
+
77
+ Args:
78
+ filepath (str or Path): Path to read data.
79
+ encoding (str): The encoding format used to open the ``filepath``.
80
+ Default: 'utf-8'.
81
+
82
+ Returns:
83
+ str: Expected text reading from ``filepath``.
84
+ """
85
+ with open(filepath, "r", encoding=encoding) as f:
86
+ value_buf = f.read()
87
+ return value_buf
88
+
89
+ def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
90
+ """Write data to a given ``filepath`` with 'wb' mode.
91
+
92
+ Note:
93
+ ``write`` will create a directory if the directory of ``filepath``
94
+ does not exist.
95
+
96
+ Args:
97
+ obj (bytes): Data to be written.
98
+ filepath (str or Path): Path to write data.
99
+ """
100
+ dirname = os.path.dirname(filepath)
101
+ if dirname and not os.path.exists(dirname):
102
+ os.makedirs(dirname, exist_ok=True)
103
+
104
+ with open(filepath, "wb") as f:
105
+ f.write(obj)
106
+
107
+ def write_text(
108
+ self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
109
+ ) -> None:
110
+ """Write data to a given ``filepath`` with 'w' mode.
111
+
112
+ Note:
113
+ ``write_text`` will create a directory if the directory of
114
+ ``filepath`` does not exist.
115
+
116
+ Args:
117
+ obj (str): Data to be written.
118
+ filepath (str or Path): Path to write data.
119
+ encoding (str): The encoding format used to open the ``filepath``.
120
+ Default: 'utf-8'.
121
+ """
122
+ dirname = os.path.dirname(filepath)
123
+ if dirname and not os.path.exists(dirname):
124
+ os.makedirs(dirname, exist_ok=True)
125
+
126
+ with open(filepath, "w", encoding=encoding) as f:
127
+ f.write(obj)
128
+
129
+ @contextlib.contextmanager
130
+ def as_local_path(
131
+ self, filepath: Union[str, Path]
132
+ ) -> Generator[Union[str, Path], None, None]:
133
+ """Only for unified API and do nothing."""
134
+ yield filepath
135
+
136
+
137
+ class HTTPStorage(Storage):
138
+ """HTTP and HTTPS storage."""
139
+
140
+ def read(self, url):
141
+ # TODO @wenmeng.zwm add progress bar if file is too large
142
+ r = requests.get(url)
143
+ r.raise_for_status()
144
+ return r.content
145
+
146
+ def read_text(self, url):
147
+ r = requests.get(url)
148
+ r.raise_for_status()
149
+ return r.text
150
+
151
+ @contextlib.contextmanager
152
+ def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
153
+ """Download a file from ``filepath``.
154
+
155
+ ``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
156
+ can be called with a ``with`` statement, and when exiting the
157
+ ``with`` statement, the temporary path will be released.
158
+
159
+ Args:
160
+ filepath (str): Download a file from ``filepath``.
161
+
162
+ Examples:
163
+ >>> storage = HTTPStorage()
164
+ >>> # After exiting the ``with`` clause,
165
+ >>> # the path will be removed
166
+ >>> with storage.as_local_path('http://path/to/file') as path:
167
+ ... # do something here
168
+ """
169
+ try:
170
+ f = tempfile.NamedTemporaryFile(delete=False)
171
+ f.write(self.read(filepath))
172
+ f.close()
173
+ yield f.name
174
+ finally:
175
+ os.remove(f.name)
176
+
177
+ def write(self, obj: bytes, url: Union[str, Path]) -> None:
178
+ raise NotImplementedError("write is not supported by HTTP Storage")
179
+
180
+ def write_text(
181
+ self, obj: str, url: Union[str, Path], encoding: str = "utf-8"
182
+ ) -> None:
183
+ raise NotImplementedError("write_text is not supported by HTTP Storage")
184
+
185
+
186
+ class OSSStorage(Storage):
187
+ """OSS storage."""
188
+
189
+ def __init__(self, oss_config_file=None):
190
+ # read from config file or env var
191
+ raise NotImplementedError("OSSStorage.__init__ to be implemented in the future")
192
+
193
+ def read(self, filepath):
194
+ raise NotImplementedError("OSSStorage.read to be implemented in the future")
195
+
196
+ def read_text(self, filepath, encoding="utf-8"):
197
+ raise NotImplementedError(
198
+ "OSSStorage.read_text to be implemented in the future"
199
+ )
200
+
201
+ @contextlib.contextmanager
202
+ def as_local_path(self, filepath: str) -> Generator[Union[str, Path], None, None]:
203
+ """Download a file from ``filepath``.
204
+
205
+ ``as_local_path`` is decorated by :meth:`contextlib.contextmanager`. It
206
+ can be called with a ``with`` statement, and when exiting the
207
+ ``with`` statement, the temporary path will be released.
208
+
209
+ Args:
210
+ filepath (str): Download a file from ``filepath``.
211
+
212
+ Examples:
213
+ >>> storage = OSSStorage()
214
+ >>> # After exiting the ``with`` clause,
215
+ >>> # the path will be removed
216
+ >>> with storage.as_local_path('http://path/to/file') as path:
217
+ ... # do something here
218
+ """
219
+ try:
220
+ f = tempfile.NamedTemporaryFile(delete=False)
221
+ f.write(self.read(filepath))
222
+ f.close()
223
+ yield f.name
224
+ finally:
225
+ os.remove(f.name)
226
+
227
+ def write(self, obj: bytes, filepath: Union[str, Path]) -> None:
228
+ raise NotImplementedError("OSSStorage.write to be implemented in the future")
229
+
230
+ def write_text(
231
+ self, obj: str, filepath: Union[str, Path], encoding: str = "utf-8"
232
+ ) -> None:
233
+ raise NotImplementedError(
234
+ "OSSStorage.write_text to be implemented in the future"
235
+ )
236
+
237
+
238
+ G_STORAGES = {}
239
+
240
+
241
+ class File(object):
242
+ _prefix_to_storage: dict = {
243
+ "oss": OSSStorage,
244
+ "http": HTTPStorage,
245
+ "https": HTTPStorage,
246
+ "local": LocalStorage,
247
+ }
248
+
249
+ @staticmethod
250
+ def _get_storage(uri):
251
+ assert isinstance(uri, str), f"uri should be str type, but got {type(uri)}"
252
+
253
+ if "://" not in uri:
254
+ # local path
255
+ storage_type = "local"
256
+ else:
257
+ prefix, _ = uri.split("://")
258
+ storage_type = prefix
259
+
260
+ assert storage_type in File._prefix_to_storage, (
261
+ f"Unsupported uri {uri}, valid prefixs: "
262
+ f"{list(File._prefix_to_storage.keys())}"
263
+ )
264
+
265
+ if storage_type not in G_STORAGES:
266
+ G_STORAGES[storage_type] = File._prefix_to_storage[storage_type]()
267
+
268
+ return G_STORAGES[storage_type]
269
+
270
+ @staticmethod
271
+ def read(uri: str) -> bytes:
272
+ """Read data from a given ``filepath`` with 'rb' mode.
273
+
274
+ Args:
275
+ filepath (str or Path): Path to read data.
276
+
277
+ Returns:
278
+ bytes: Expected bytes object.
279
+ """
280
+ storage = File._get_storage(uri)
281
+ return storage.read(uri)
282
+
283
+ @staticmethod
284
+ def read_text(uri: Union[str, Path], encoding: str = "utf-8") -> str:
285
+ """Read data from a given ``filepath`` with 'r' mode.
286
+
287
+ Args:
288
+ filepath (str or Path): Path to read data.
289
+ encoding (str): The encoding format used to open the ``filepath``.
290
+ Default: 'utf-8'.
291
+
292
+ Returns:
293
+ str: Expected text reading from ``filepath``.
294
+ """
295
+ storage = File._get_storage(uri)
296
+ return storage.read_text(uri)
297
+
298
+ @staticmethod
299
+ def write(obj: bytes, uri: Union[str, Path]) -> None:
300
+ """Write data to a given ``filepath`` with 'wb' mode.
301
+
302
+ Note:
303
+ ``write`` will create a directory if the directory of ``filepath``
304
+ does not exist.
305
+
306
+ Args:
307
+ obj (bytes): Data to be written.
308
+ filepath (str or Path): Path to write data.
309
+ """
310
+ storage = File._get_storage(uri)
311
+ return storage.write(obj, uri)
312
+
313
+ @staticmethod
314
+ def write_text(obj: str, uri: str, encoding: str = "utf-8") -> None:
315
+ """Write data to a given ``filepath`` with 'w' mode.
316
+
317
+ Note:
318
+ ``write_text`` will create a directory if the directory of
319
+ ``filepath`` does not exist.
320
+
321
+ Args:
322
+ obj (str): Data to be written.
323
+ filepath (str or Path): Path to write data.
324
+ encoding (str): The encoding format used to open the ``filepath``.
325
+ Default: 'utf-8'.
326
+ """
327
+ storage = File._get_storage(uri)
328
+ return storage.write_text(obj, uri)
329
+
330
+ @contextlib.contextmanager
331
+ def as_local_path(uri: str) -> Generator[Union[str, Path], None, None]:
332
+ """Only for unified API and do nothing."""
333
+ storage = File._get_storage(uri)
334
+ with storage.as_local_path(uri) as local_path:
335
+ yield local_path
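A short usage sketch of the unified File facade; the local path and URL below are placeholders:

from funasr_detach.download.file import File, download_from_url

text = File.read_text("/tmp/example.txt")            # no scheme -> LocalStorage
blob = File.read("https://example.com/audio.wav")    # https -> HTTPStorage
local_copy = download_from_url("https://example.com/audio.wav")  # temp local copy
print(len(blob), local_copy)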
demo/Step-Audio-EditX/funasr_detach/download/name_maps_from_hub.py ADDED
@@ -0,0 +1,13 @@
1
+ name_maps_ms = {
2
+ "paraformer-zh": "damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
3
+ "paraformer-en": "damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
4
+ "paraformer-en-spk": "damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020",
5
+ "paraformer-zh-streaming": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
6
+ "fsmn-vad": "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
7
+ "ct-punc": "damo/punc_ct-transformer_cn-en-common-vocab471067-large",
8
+ "ct-punc-c": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
9
+ "fa-zh": "damo/speech_timestamp_prediction-v1-16k-offline",
10
+ "cam++": "damo/speech_campplus_sv_zh-cn_16k-common",
11
+ }
12
+
13
+ name_maps_hf = {}
demo/Step-Audio-EditX/funasr_detach/download/runtime_sdk_download_tool.py ADDED
@@ -0,0 +1,60 @@
1
+ import os
2
+ import argparse
3
+ from pathlib import Path
4
+
5
+ from funasr_detach.utils.types import str2bool
6
+
7
+
8
+ def main():
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument("--model-name", type=str, required=True)
11
+ parser.add_argument("--export-dir", type=str, required=True)
12
+ parser.add_argument(
13
+ "--export", type=str2bool, default=True, help="whether to export model"
14
+ )
15
+ parser.add_argument("--type", type=str, default="onnx", help='["onnx", "torch"]')
16
+ parser.add_argument("--device", type=str, default="cpu", help='["cpu", "cuda"]')
17
+ parser.add_argument(
18
+ "--quantize", type=str2bool, default=False, help="export quantized model"
19
+ )
20
+ parser.add_argument(
21
+ "--fallback-num", type=int, default=0, help="amp fallback number"
22
+ )
23
+ parser.add_argument("--audio_in", type=str, default=None, help='["wav", "wav.scp"]')
24
+ parser.add_argument(
25
+ "--model_revision", type=str, default=None, help="model_revision"
26
+ )
27
+ parser.add_argument("--calib_num", type=int, default=200, help="calib max num")
28
+ args = parser.parse_args()
29
+
30
+ model_dir = args.model_name
31
+ if not Path(args.model_name).exists():
32
+ from modelscope.hub.snapshot_download import snapshot_download
33
+
34
+ try:
35
+ model_dir = snapshot_download(
36
+ args.model_name, cache_dir=args.export_dir, revision=args.model_revision
37
+ )
38
+ except:
39
+ raise "model_dir must be model_name in modelscope or local path downloaded from modelscope, but is {}".format(
40
+ model_dir
41
+ )
42
+ if args.export:
43
+ model_file = os.path.join(model_dir, "model.onnx")
44
+ if args.quantize:
45
+ model_file = os.path.join(model_dir, "model_quant.onnx")
46
+ if not os.path.exists(model_file):
47
+ print(".onnx is not exist, begin to export onnx")
48
+ from funasr_detach.bin.export_model import ModelExport
49
+
50
+ export_model = ModelExport(
51
+ cache_dir=args.export_dir,
52
+ onnx=True,
53
+ device="cpu",
54
+ quant=args.quantize,
55
+ )
56
+ export_model.export(model_dir)
57
+
58
+
59
+ if __name__ == "__main__":
60
+ main()
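An example invocation of the export tool (the model id comes from the alias table above, the export directory is a placeholder, and modelscope plus the ONNX export dependencies are assumed to be installed):

python -m funasr_detach.download.runtime_sdk_download_tool \
    --model-name damo/speech_fsmn_vad_zh-cn-16k-common-pytorch \
    --export-dir ./export \
    --export true \
    --quantize false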
demo/Step-Audio-EditX/funasr_detach/frontends/__init__.py ADDED
File without changes