root committed on
Commit
08ce36d
·
0 Parent(s):

Clean init push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +10 -0
  2. .gitignore +14 -0
  3. .no_build +0 -0
  4. Dockerfile +34 -0
  5. Install.bat +21 -0
  6. Install.sh +18 -0
  7. LICENSE +201 -0
  8. README.md +130 -0
  9. app.py +352 -0
  10. configs/translation.yaml +459 -0
  11. docker-compose.yaml +29 -0
  12. modules/__init__.py +0 -0
  13. modules/diarize/__init__.py +0 -0
  14. modules/diarize/audio_loader.py +179 -0
  15. modules/diarize/diarize_pipeline.py +98 -0
  16. modules/diarize/diarizer.py +145 -0
  17. modules/translation/__init__.py +0 -0
  18. modules/translation/deepl_api.py +217 -0
  19. modules/translation/nllb_inference.py +289 -0
  20. modules/translation/translation_base.py +181 -0
  21. modules/ui/__init__.py +0 -0
  22. modules/ui/htmls.py +97 -0
  23. modules/utils/__init__.py +0 -0
  24. modules/utils/cli_manager.py +12 -0
  25. modules/utils/constants.py +6 -0
  26. modules/utils/files_manager.py +75 -0
  27. modules/utils/paths.py +32 -0
  28. modules/utils/subtitle_manager.py +438 -0
  29. modules/utils/youtube_manager.py +33 -0
  30. modules/uvr/music_separator.py +183 -0
  31. modules/vad/__init__.py +0 -0
  32. modules/vad/silero_vad.py +265 -0
  33. modules/whisper/__init__.py +0 -0
  34. modules/whisper/base_transcription_pipeline.py +563 -0
  35. modules/whisper/data_classes.py +608 -0
  36. modules/whisper/faster_whisper_inference.py +176 -0
  37. modules/whisper/insanely_fast_whisper_inference.py +195 -0
  38. modules/whisper/whisper_Inference.py +111 -0
  39. modules/whisper/whisper_factory.py +84 -0
  40. notebook/whisper-webui.ipynb +134 -0
  41. requirements.txt +18 -0
  42. start-webui.bat +7 -0
  43. start-webui.sh +6 -0
  44. tests/test_bgm_separation.py +53 -0
  45. tests/test_config.py +40 -0
  46. tests/test_diarization.py +31 -0
  47. tests/test_srt.srt +7 -0
  48. tests/test_transcription.py +110 -0
  49. tests/test_translation.py +56 -0
  50. tests/test_vad.py +26 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # from .gitignore
2
+ modules/yt_tmp.wav
3
+ **/venv/
4
+ **/__pycache__/
5
+ **/outputs/
6
+ **/models/
7
+
8
+ **/.idea
9
+ **/.git
10
+ **/.github
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.wav
2
+ *.png
3
+ *.mp4
4
+ *.mp3
5
+ .idea/
6
+ .pytest_cache/
7
+ venv/
8
+ modules/ui/__pycache__/
9
+ outputs/
10
+ modules/__pycache__/
11
+ models/
12
+ modules/yt_tmp.wav
13
+ configs/default_parameters.yaml
14
+ __pycache__/
.no_build ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Multi-stage build: resolve Python dependencies into a venv in a throwaway
# builder image, then copy only the finished venv into a slim runtime image.
FROM debian:bookworm-slim AS builder

RUN apt-get update && \
    apt-get install -y curl git python3 python3-pip python3-venv && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
    mkdir -p /Whisper-WebUI

WORKDIR /Whisper-WebUI

# Copy only requirements.txt first so the dependency layer is cached
# independently of source-code changes.
COPY requirements.txt .

RUN python3 -m venv venv && \
    . venv/bin/activate && \
    pip install --no-cache-dir -r requirements.txt


FROM debian:bookworm-slim AS runtime

# ffmpeg is required at runtime for audio decoding.
RUN apt-get update && \
    apt-get install -y curl ffmpeg python3 && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

WORKDIR /Whisper-WebUI

COPY . .
COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv

# Persist downloaded models and generated outputs outside the container.
VOLUME [ "/Whisper-WebUI/models" ]
VOLUME [ "/Whisper-WebUI/outputs" ]

ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
# NVIDIA cuBLAS/cuDNN shared libraries ship inside pip wheels in the venv and
# must be visible to the dynamic linker. Debian venvs put site-packages under
# lib/ (sys.platlibdir == "lib"), so the original lib64/-only path would miss
# them; list both lib/ and lib64/ for compatibility.
ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib/python3.11/site-packages/nvidia/cudnn/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib

ENTRYPOINT [ "python", "app.py" ]
Install.bat ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off

rem Create the virtual environment only if it does not exist yet.
if not exist "%~dp0\venv\Scripts" (
    echo Creating venv...
    python -m venv venv
)
echo checked the venv folder. now installing requirements..

rem Activate the venv so the following pip commands install into it.
call "%~dp0\venv\scripts\activate"

python -m pip install -U pip
pip install -r requirements.txt

rem Report the outcome of the requirements installation.
if errorlevel 1 goto install_failed
echo.
echo Requirements installed successfully.
goto install_done

:install_failed
echo.
echo Requirements installation failed. please remove venv folder and run install.bat again.

:install_done
pause
Install.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Create a local virtual environment (if missing) and install the project
# requirements into it, reporting success or failure.

if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python -m venv venv
fi

# Activate the venv so pip installs into it rather than the system Python.
source venv/bin/activate

python -m pip install -U pip

if pip install -r requirements.txt; then
    echo "Requirements installed successfully."
else
    echo ""
    echo "Requirements installation failed. Please remove the venv folder and run the script again."
    deactivate
    exit 1
fi

deactivate
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 jhj0517
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Whisper-WebUI
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.37.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Whisper-WebUI
14
+ A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an Easy Subtitle Generator!
15
+
16
+ ![Whisper WebUI](https://github.com/jhj0517/Whisper-WebUI/blob/master/screenshot.png)
17
+
18
+ ## Notebook
19
+ If you wish to try this on Colab, you can do it in [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
20
+
21
+ # Feature
22
+ - Select the Whisper implementation you want to use between :
23
+ - [openai/whisper](https://github.com/openai/whisper)
24
+ - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
25
+ - [insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
26
+ - Generate subtitles from various sources, including :
27
+ - Files
28
+ - Youtube
29
+ - Microphone
30
+ - Currently supported subtitle formats :
31
+ - SRT
32
+ - WebVTT
33
+ - txt ( only text file without timeline )
34
+ - Speech to Text Translation
35
+ - From other languages to English. ( This is Whisper's end-to-end speech-to-text translation feature )
36
+ - Text to Text Translation
37
+ - Translate subtitle files using Facebook NLLB models
38
+ - Translate subtitle files using DeepL API
39
+ - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
40
+ - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
41
+ - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
42
+ 1. https://huggingface.co/pyannote/speaker-diarization-3.1
43
+ 2. https://huggingface.co/pyannote/segmentation-3.0
44
+
45
+ # Installation and Running
46
+ ### Prerequisite
47
+ To run this WebUI, you need to have `git`, `python` version 3.8 ~ 3.10, `FFmpeg` and `CUDA` (if you use NVIDIA GPU) version above 12.0
48
+
49
+ Please follow the links below to install the necessary software:
50
+ - git : [https://git-scm.com/downloads](https://git-scm.com/downloads)
51
+ - python : [https://www.python.org/downloads/](https://www.python.org/downloads/) **( If your python version is too new, torch will not install properly.)**
52
+ - FFmpeg : [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
53
+ - CUDA : [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
54
+
55
+ After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
56
+
57
+ ### Automatic Installation
58
+
59
+ 1. Download `Whisper-WebUI.zip` with the file corresponding to your OS from [v1.0.0](https://github.com/jhj0517/Whisper-WebUI/releases/tag/v1.0.0) and extract its contents.
60
+ 2. Run `Install.bat` or `Install.sh` to install dependencies. (This will create a `venv` directory and install dependencies there.)
61
+ 3. Start WebUI with `start-webui.bat` or `start-webui.sh`
62
+ 4. To update the WebUI, run `update.bat` or `update.sh`
63
+
64
+ And you can also run the project with command line arguments if you like by running `start-webui.bat`, see [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide to arguments.
65
+
66
+ ## Running with Docker
67
+
68
+ 1. Build the image
69
+
70
+ ```sh
71
+ docker build -t whisper-webui:latest .
72
+ ```
73
+
74
+ 2. Run the container with commands
75
+
76
+ - For bash :
77
+ ```sh
78
+ docker run --gpus all -d \
79
+ -v /path/to/models:/Whisper-WebUI/models \
80
+ -v /path/to/outputs:/Whisper-WebUI/outputs \
81
+ -p 7860:7860 \
82
+ -it \
83
+ whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
84
+ ```
85
+ - For PowerShell:
86
+ ```shell
87
+ docker run --gpus all -d `
88
+ -v /path/to/models:/Whisper-WebUI/models `
89
+ -v /path/to/outputs:/Whisper-WebUI/outputs `
90
+ -p 7860:7860 `
91
+ -it `
92
+ whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
93
+ ```
94
+
95
+ # VRAM Usages
96
+ This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
97
+
98
+ According to faster-whisper, the efficiency of the optimized whisper model is as follows:
99
+ | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
100
+ |-------------------|-----------|-----------|-------|-----------------|-----------------|
101
+ | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
102
+ | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
103
+
104
+ If you want to use an implementation other than faster-whisper, use `--whisper_type` arg and the repository name.<br>
105
+ Read [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about CLI args.
106
+
107
+ ## Available models
108
+ This is Whisper's original VRAM usage table for models.
109
+
110
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
111
+ |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
112
+ | tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
113
+ | base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
114
+ | small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
115
+ | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
116
+ | large | 1550 M | N/A | `large` | ~10 GB | 1x |
117
+
118
+
119
+ `.en` models are for English only, and the cool thing is that you can use the `Translate to English` option from the "large" models!
120
+
121
+ ## TODO🗓
122
+
123
+ - [x] Add DeepL API translation
124
+ - [x] Add NLLB Model translation
125
+ - [x] Integrate with faster-whisper
126
+ - [x] Integrate with insanely-fast-whisper
127
+ - [x] Integrate with whisperX ( Only speaker diarization part )
128
+ - [ ] Add fast api script
129
+
130
+
app.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
+ import yaml
6
+
7
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
+ INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
+ from modules.utils.files_manager import load_yaml
11
+ from modules.whisper.whisper_factory import WhisperFactory
12
+ from modules.translation.nllb_inference import NLLBInference
13
+ from modules.ui.htmls import *
14
+ from modules.utils.cli_manager import str2bool
15
+ from modules.utils.youtube_manager import get_ytmetas
16
+ from modules.translation.deepl_api import DeepLAPI
17
+ from modules.whisper.data_classes import *
18
+
19
+
20
+ class App:
21
    def __init__(self, args):
        """Build the Gradio Blocks app and instantiate every inference backend.

        Parameters
        ----------
        args : argparse.Namespace
            Parsed CLI arguments. This constructor reads ``theme``,
            ``whisper_type``, the various ``*_model_dir`` paths,
            ``output_dir``, and (elsewhere in the class) ``colab``.
        """
        self.args = args
        # delete_cache=(60, 3600): presumably sweeps Gradio's temp-file cache
        # every 60s, deleting entries older than 1h -- confirm against Gradio docs.
        self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
        # i18n context manager driven by the translation YAML; entered in launch().
        self.i18n = Translate(I18N_YAML_PATH)
        # Factory selects the Whisper implementation named by --whisper_type
        # (faster-whisper / openai whisper / insanely-fast-whisper).
        self.whisper_inf = WhisperFactory.create_whisper_inference(
            whisper_type=self.args.whisper_type,
            whisper_model_dir=self.args.whisper_model_dir,
            faster_whisper_model_dir=self.args.faster_whisper_model_dir,
            insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
            uvr_model_dir=self.args.uvr_model_dir,
            output_dir=self.args.output_dir,
        )
        # NLLB text-to-text translation backend; writes under <output_dir>/translations.
        self.nllb_inf = NLLBInference(
            model_dir=self.args.nllb_model_dir,
            output_dir=os.path.join(self.args.output_dir, "translations")
        )
        # DeepL API client shares the same translations output directory.
        self.deepl_api = DeepLAPI(
            output_dir=os.path.join(self.args.output_dir, "translations")
        )
        # Default UI parameter values loaded from the YAML config file.
        self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        print(f"Use \"{self.args.whisper_type}\" implementation\n"
              f"Device \"{self.whisper_inf.device}\" is detected")
43
+
44
    def create_pipeline_inputs(self):
        """Create the shared transcription-parameter widgets for a tab.

        Builds the model/language/format dropdowns, the translate/timestamp
        checkboxes, and the four advanced-parameter accordions (Whisper, BGM
        separation, VAD, diarization), wiring the model dropdown's change
        event to ``on_change_models``.

        Returns
        -------
        tuple
            ``(pipeline_inputs, dd_file_format, cb_timestamp)`` where
            ``pipeline_inputs`` is the flat list of Gradio components passed
            to the transcription functions. NOTE(review): the component order
            in ``pipeline_inputs`` presumably must match the parameter order
            expected by ``transcribe_*`` -- do not reorder.
        """
        # Per-section default values loaded from the parameters YAML.
        whisper_params = self.default_params["whisper"]
        vad_params = self.default_params["vad"]
        diarization_params = self.default_params["diarization"]
        uvr_params = self.default_params["bgm_separation"]

        with gr.Row():
            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
                                   label=_("Model"))
            # The stored config keeps the unwrapped string form of
            # AUTOMATIC_DETECTION, so map it back to the sentinel for the UI.
            dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
                                  value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
                                  else whisper_params["lang"], label=_("Language"))
            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
        with gr.Row():
            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
                                       interactive=True)
        with gr.Row():
            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
                                       label=_("Add a timestamp to the end of the filename"),
                                       interactive=True)

        # Advanced Whisper parameters rendered from the data class; contents
        # vary with the selected whisper implementation.
        with gr.Accordion(_("Advanced Parameters"), open=False):
            whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
                                                            whisper_type=self.args.whisper_type,
                                                            available_compute_types=self.whisper_inf.available_compute_types,
                                                            compute_type=self.whisper_inf.current_compute_type)

        # Background-music (UVR) separation pre-processing controls.
        with gr.Accordion(_("Background Music Remover Filter"), open=False):
            uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
                                                             available_models=self.whisper_inf.music_separator.available_models,
                                                             available_devices=self.whisper_inf.music_separator.available_devices,
                                                             device=self.whisper_inf.music_separator.device)

        # Silero VAD pre-processing controls.
        with gr.Accordion(_("Voice Detection Filter"), open=False):
            vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)

        # Speaker-diarization post-processing controls.
        with gr.Accordion(_("Diarization"), open=False):
            diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
                                                                    available_devices=self.whisper_inf.diarizer.available_device,
                                                                    device=self.whisper_inf.diarizer.device)

        # Changing the model may toggle the translate checkbox (e.g. for
        # English-only models) -- handled by on_change_models.
        dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

        pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs

        return (
            pipeline_inputs,
            dd_file_format,
            cb_timestamp
        )
94
+
95
+ def launch(self):
96
+ translation_params = self.default_params["translation"]
97
+ deepl_params = translation_params["deepl"]
98
+ nllb_params = translation_params["nllb"]
99
+ uvr_params = self.default_params["bgm_separation"]
100
+
101
+ with self.app:
102
+ with self.i18n:
103
+ with gr.Row():
104
+ with gr.Column():
105
+ gr.Markdown(MARKDOWN, elem_id="md_project")
106
+ with gr.Tabs():
107
+ with gr.TabItem(_("File")): # tab1
108
+ with gr.Column():
109
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
110
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
111
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
112
+ " Leave this field empty if you do not wish to use a local path.",
113
+ visible=self.args.colab,
114
+ value="")
115
+
116
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
117
+
118
+ with gr.Row():
119
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
120
+ with gr.Row():
121
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
122
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
123
+ btn_openfolder = gr.Button('📂', scale=1)
124
+
125
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
126
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
127
+ inputs=params + pipeline_params,
128
+ outputs=[tb_indicator, files_subtitles])
129
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
130
+
131
+ with gr.TabItem(_("Youtube")): # tab2
132
+ with gr.Row():
133
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
134
+ with gr.Row(equal_height=True):
135
+ with gr.Column():
136
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
137
+ with gr.Column():
138
+ tb_title = gr.Label(label=_("Youtube Title"))
139
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
140
+
141
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
142
+
143
+ with gr.Row():
144
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
145
+ with gr.Row():
146
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
147
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
148
+ btn_openfolder = gr.Button('📂', scale=1)
149
+
150
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
151
+
152
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
153
+ inputs=params + pipeline_params,
154
+ outputs=[tb_indicator, files_subtitles])
155
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
156
+ outputs=[img_thumbnail, tb_title, tb_description])
157
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
158
+
159
+ with gr.TabItem(_("Mic")): # tab3
160
+ with gr.Row():
161
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
162
+
163
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
164
+
165
+ with gr.Row():
166
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
167
+ with gr.Row():
168
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
169
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
170
+ btn_openfolder = gr.Button('📂', scale=1)
171
+
172
+ params = [mic_input, dd_file_format, cb_timestamp]
173
+
174
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
175
+ inputs=params + pipeline_params,
176
+ outputs=[tb_indicator, files_subtitles])
177
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
178
+
179
+ with gr.TabItem(_("T2T Translation")): # tab 4
180
+ with gr.Row():
181
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
182
+
183
+ with gr.TabItem(_("DeepL API")): # sub tab1
184
+ with gr.Row():
185
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
186
+ value=deepl_params["api_key"])
187
+ with gr.Row():
188
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
189
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
190
+ else deepl_params["source_lang"],
191
+ choices=list(self.deepl_api.available_source_langs.keys()))
192
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
193
+ value=deepl_params["target_lang"],
194
+ choices=list(self.deepl_api.available_target_langs.keys()))
195
+ with gr.Row():
196
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
197
+ with gr.Row():
198
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
199
+ label=_("Add a timestamp to the end of the filename"),
200
+ interactive=True)
201
+ with gr.Row():
202
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
203
+ with gr.Row():
204
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
205
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
206
+ btn_openfolder = gr.Button('📂', scale=1)
207
+
208
+ btn_run.click(fn=self.deepl_api.translate_deepl,
209
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
210
+ cb_is_pro, cb_timestamp],
211
+ outputs=[tb_indicator, files_subtitles])
212
+
213
+ btn_openfolder.click(
214
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
215
+ inputs=None,
216
+ outputs=None)
217
+
218
+ with gr.TabItem(_("NLLB")): # sub tab2
219
+ with gr.Row():
220
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
221
+ choices=self.nllb_inf.available_models)
222
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
223
+ value=nllb_params["source_lang"],
224
+ choices=self.nllb_inf.available_source_langs)
225
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
226
+ value=nllb_params["target_lang"],
227
+ choices=self.nllb_inf.available_target_langs)
228
+ with gr.Row():
229
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
230
+ precision=0)
231
+ with gr.Row():
232
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
233
+ label=_("Add a timestamp to the end of the filename"),
234
+ interactive=True)
235
+ with gr.Row():
236
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
237
+ with gr.Row():
238
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
239
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
240
+ btn_openfolder = gr.Button('📂', scale=1)
241
+ with gr.Column():
242
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
243
+
244
+ btn_run.click(fn=self.nllb_inf.translate_file,
245
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
246
+ nb_max_length, cb_timestamp],
247
+ outputs=[tb_indicator, files_subtitles])
248
+
249
+ btn_openfolder.click(
250
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
251
+ inputs=None,
252
+ outputs=None)
253
+
254
+ with gr.TabItem(_("BGM Separation")):
255
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
256
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
257
+ choices=self.whisper_inf.music_separator.available_devices)
258
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
259
+ choices=self.whisper_inf.music_separator.available_models)
260
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
261
+ precision=0)
262
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
263
+ value=True, visible=False)
264
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
265
+ with gr.Column():
266
+ with gr.Row():
267
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
268
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
269
+ with gr.Row():
270
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
271
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
272
+
273
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
274
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
275
+ cb_uvr_save_file],
276
+ outputs=[ad_instrumental, ad_vocals])
277
+ btn_open_instrumental_folder.click(inputs=None,
278
+ outputs=None,
279
+ fn=lambda: self.open_folder(os.path.join(
280
+ self.args.output_dir, "UVR", "instrumental"
281
+ )))
282
+ btn_open_vocals_folder.click(inputs=None,
283
+ outputs=None,
284
+ fn=lambda: self.open_folder(os.path.join(
285
+ self.args.output_dir, "UVR", "vocals"
286
+ )))
287
+
288
+ # Launch the app with optional gradio settings
289
+ args = self.args
290
+ self.app.queue(
291
+ api_open=args.api_open
292
+ ).launch(
293
+ share=args.share,
294
+ server_name=args.server_name,
295
+ server_port=args.server_port,
296
+ auth=(args.username, args.password) if args.username and args.password else None,
297
+ root_path=args.root_path,
298
+ inbrowser=args.inbrowser
299
+ )
300
+
301
+ @staticmethod
302
+ def open_folder(folder_path: str):
303
+ if os.path.exists(folder_path):
304
+ os.system(f"start {folder_path}")
305
+ else:
306
+ os.makedirs(folder_path, exist_ok=True)
307
+ print(f"The directory path {folder_path} has newly created.")
308
+
309
@staticmethod
def on_change_models(model_size: str):
    """Toggle the 'Translate to English?' checkbox for the selected model.

    Only the multilingual "large" family supports Whisper's built-in
    translation task, so the checkbox is shown for those models and
    hidden (and forced off) for everything else.
    """
    translatable = {"large", "large-v1", "large-v2", "large-v3"}
    if model_size in translatable:
        return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
    return gr.Checkbox(visible=False, value=False, interactive=False)
316
+
317
+
318
# Command-line interface for launching the WebUI.
parser = argparse.ArgumentParser()
parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
                    choices=[impl.value for impl in WhisperImpl],
                    help='A type of the whisper implementation (Github repo name)')
# Gradio server / UI options. Boolean flags use `nargs='?', const=True` so a
# bare `--flag` (no value) enables the option.
parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True,
                    help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None,
                    help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None,
                    help='Gradio server port')
parser.add_argument('--root_path', type=str, default=None,
                    help='Gradio root path')
parser.add_argument('--username', type=str, default=None,
                    help='Gradio authentication username')
parser.add_argument('--password', type=str, default=None,
                    help='Gradio authentication password')
parser.add_argument('--theme', type=str, default=None,
                    help='Gradio Blocks theme')
parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True,
                    help='Is colab user or not')
parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
                    help='Enable api or not in Gradio')
parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
                    help='Whether to automatically start Gradio app or not')
# Model / output directory options (defaults come from modules.utils.paths).
parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                    help='Directory path of the whisper model')
parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
                    help='Directory path of the faster-whisper model')
parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=INSANELY_FAST_WHISPER_MODELS_DIR,
                    help='Directory path of the insanely-fast-whisper model')
parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
                    help='Directory path of the diarization model')
parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                    help='Directory path of the Facebook NLLB model')
parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
                    help='Directory path of the UVR model')
parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR,
                    help='Directory path of the outputs')
_args = parser.parse_args()
349
+
350
+ if __name__ == "__main__":
351
+ app = App(args=_args)
352
+ app.launch(share=True)
configs/translation.yaml ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스탬프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
322
+
323
+ uk: # Ukrainian
324
+ Language: Мова
325
+ File: Файл
326
+ Youtube: Youtube
327
+ Mic: Мікрофон
328
+ T2T Translation: T2T Переклад
329
+ BGM Separation: Розділення фонової музики
330
+ GENERATE SUBTITLE FILE: СТВОРИТИ ФАЙЛ СУБТИТРІВ
331
+ Output: Результат
332
+ Downloadable output file: Завантажуваний файл результату
333
+ Upload File here: Завантажте файл тут
334
+ Model: Модель
335
+ Automatic Detection: Автоматичне визначення
336
+ File Format: Формат файлу
337
+ Translate to English?: Перекласти на англійську?
338
+ Add a timestamp to the end of the filename: Додати мітку часу до кінця імені файлу
339
+ Advanced Parameters: Розширені параметри
340
+ Background Music Remover Filter: Фільтр видалення фонової музики
341
+ Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією
342
+ Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики
343
+ Save separated files to output: Зберегти розділені файли до вихідної папки
344
+ Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики
345
+ Voice Detection Filter: Фільтр розпізнавання голосу
346
+ Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі
347
+ Enable Silero VAD Filter: Увімкнути фільтр Silero VAD
348
+ Diarization: Діаризація
349
+ Enable Diarization: Увімкнути діаризацію
350
+ HuggingFace Token: Токен HuggingFace
351
+ This is only needed the first time you download the model: Це потрібно лише при першому завантаженні моделі. Якщо у вас вже є моделі, вводити не потрібно. Щоб завантажити модель, потрібно вручну перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" та "https://huggingface.co/pyannote/segmentation-3.0" і погодитися з їхніми вимогами.
352
+ Device: Пристрій
353
+ Youtube Link: Посилання на Youtube
354
+ Youtube Thumbnail: Ескіз Youtube
355
+ Youtube Title: Назва Youtube
356
+ Youtube Description: Опис Youtube
357
+ Record with Mic: Записати з мікрофона
358
+ Upload Subtitle Files to translate here: Завантажте файли субтитрів для перекладу тут
359
+ Your Auth Key (API KEY): Ваш ключ авторизації (API KEY)
360
+ Source Language: Мова джерела
361
+ Target Language: Мова перекладу
362
+ Pro User?: Професійний користувач?
363
+ TRANSLATE SUBTITLE FILE: ПЕРЕКЛАСТИ ФАЙЛ СУБТИТРІВ
364
+ Upload Audio Files to separate background music: Завантажте аудіофайли для розділення фонової музики
365
+ Instrumental: Інструментал
366
+ Vocals: Вокал
367
+ SEPARATE BACKGROUND MUSIC: РОЗДІЛИТИ ФОНОВУ МУЗИКУ
368
+
369
+ ru: # Russian
370
+ Language: Язык
371
+ File: Файл
372
+ Youtube: Youtube
373
+ Mic: Микрофон
374
+ T2T Translation: Перевод T2T
375
+ BGM Separation: Разделение фоновой музыки
376
+ GENERATE SUBTITLE FILE: СГЕНЕРИРОВАТЬ ФАЙЛ СУБТИТРОВ
377
+ Output: Результат
378
+ Downloadable output file: Загружаемый файл результата
379
+ Upload File here: Загрузите файл здесь
380
+ Model: Модель
381
+ Automatic Detection: Автоматическое определение
382
+ File Format: Формат файла
383
+ Translate to English?: Перевести на английский?
384
+ Add a timestamp to the end of the filename: Добавить метку времени в конец имени файла
385
+ Advanced Parameters: Расширенные параметры
386
+ Background Music Remover Filter: Фильтр удаления фоновой музыки
387
+ Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией
388
+ Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки
389
+ Save separated files to output: Сохранить разделенные файлы в выходную папку
390
+ Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки
391
+ Voice Detection Filter: Фильтр обнаружения голоса
392
+ Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели
393
+ Enable Silero VAD Filter: Включить фильтр Silero VAD
394
+ Diarization: Диаризация
395
+ Enable Diarization: Включить диаризацию
396
+ HuggingFace Token: Токен HuggingFace
397
+ This is only needed the first time you download the model: Это нужно только при первом скачивании модели. Если у вас уже есть модели, вводить не нужно. Чтобы скачать модель, нужно вручную перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" и "https://huggingface.co/pyannote/segmentation-3.0" и согласиться с их требованиями.
398
+ Device: Устройство
399
+ Youtube Link: Ссылка на Youtube
400
+ Youtube Thumbnail: Миниатюра Youtube
401
+ Youtube Title: Название Youtube
402
+ Youtube Description: Описание Youtube
403
+ Record with Mic: Записать с микрофона
404
+ Upload Subtitle Files to translate here: Загрузите файлы субтитров для перевода здесь
405
+ Your Auth Key (API KEY): Ваш Auth Key (API KEY)
406
+ Source Language: Исходный язык
407
+ Target Language: Целевой язык
408
+ Pro User?: Профессиональный пользователь?
409
+ TRANSLATE SUBTITLE FILE: ПЕРЕВЕСТИ ФАЙЛ СУБТИТРОВ
410
+ Upload Audio Files to separate background music: Загрузите аудиофайлы для разделения фоновой музыки
411
+ Instrumental: Инструментал
412
+ Vocals: Вокал
413
+ SEPARATE BACKGROUND MUSIC: РАЗДЕЛИТЬ ФОНОВУЮ МУЗЫКУ
414
+
415
+ tr: # Turkish
416
+ Language: Dil
417
+ File: Dosya
418
+ Youtube: Youtube
419
+ Mic: Mikrofon
420
+ T2T Translation: T2T Çeviri
421
+ BGM Separation: Arka Plan Müziği Ayırma
422
+ GENERATE SUBTITLE FILE: ALTYAZI DOSYASI OLUŞTUR
423
+ Output: Çıktı
424
+ Downloadable output file: İndirilebilir çıktı dosyası
425
+ Upload File here: Dosya Yükle
426
+ Model: Model
427
+ Automatic Detection: Otomatik Algılama
428
+ File Format: Dosya Formatı
429
+ Translate to English?: İngilizceye Çevir?
430
+ Add a timestamp to the end of the filename: Dosya adının sonuna zaman damgası ekle
431
+ Advanced Parameters: Gelişmiş Parametreler
432
+ Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresi
433
+ Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır
434
+ Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir
435
+ Save separated files to output: Ayrılmış dosyaları çıktıya kaydet
436
+ Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak
437
+ Voice Detection Filter: Ses Algılama Filtresi
438
+ Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et
439
+ Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir
440
+ Diarization: Konuşmacı Ayrımı
441
+ Enable Diarization: Konuşmacı Ayrımını Etkinleştir
442
+ HuggingFace Token: HuggingFace Anahtarı
443
+ This is only needed the first time you download the model: Bu, modeli ilk kez indirirken gereklidir. Zaten modelleriniz varsa girmenize gerek yok. Modeli indirmek için "https://huggingface.co/pyannote/speaker-diarization-3.1" ve "https://huggingface.co/pyannote/segmentation-3.0" adreslerine gidip gereksinimlerini kabul etmeniz gerekiyor
444
+ Device: Cihaz
445
+ Youtube Link: Youtube Bağlantısı
446
+ Youtube Thumbnail: Youtube Küçük Resmi
447
+ Youtube Title: Youtube Başlığı
448
+ Youtube Description: Youtube Açıklaması
449
+ Record with Mic: Mikrofonla Kaydet
450
+ Upload Subtitle Files to translate here: Çeviri için altyazı dosyalarını buraya yükle
451
+ Your Auth Key (API KEY): Yetki Anahtarınız (API ANAHTARI)
452
+ Source Language: Kaynak Dil
453
+ Target Language: Hedef Dil
454
+ Pro User?: Pro Kullanıcı?
455
+ TRANSLATE SUBTITLE FILE: ALTYAZI DOSYASINI ÇEVİR
456
+ Upload Audio Files to separate background music: Arka plan müziğini ayırmak için ses dosyalarını yükle
457
+ Instrumental: Enstrümantal
458
+ Vocals: Vokal
459
+ SEPARATE BACKGROUND MUSIC: ARKA PLAN MÜZİĞİNİ AYIR
docker-compose.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ app:
3
+ build: .
4
+ image: jhj0517/whisper-webui:latest
5
+
6
+ volumes:
7
+ # Update paths to mount models and output paths to your custom paths like this, e.g:
8
+ # - C:/whisper-models/custom-path:/Whisper-WebUI/models
9
+ # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
10
+ - /Whisper-WebUI/models
11
+ - /Whisper-WebUI/outputs
12
+
13
+ ports:
14
+ - "7860:7860"
15
+
16
+ stdin_open: true
17
+ tty: true
18
+
19
+ entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0"]
20
+
21
+ # If you're not using nvidia GPU, Update device to match yours.
22
+ # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
23
+ deploy:
24
+ resources:
25
+ reservations:
26
+ devices:
27
+ - driver: nvidia
28
+ count: all
29
+ capabilities: [ gpu ]
modules/__init__.py ADDED
File without changes
modules/diarize/__init__.py ADDED
File without changes
modules/diarize/audio_loader.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
2
+
3
+ import os
4
+ import subprocess
5
+ from functools import lru_cache
6
+ from typing import Optional, Union
7
+ from scipy.io.wavfile import write
8
+ import tempfile
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
def exact_div(x, y):
    """Divide x by y, asserting that the division leaves no remainder."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
17
+
18
# hard-coded audio hyperparameters (these match OpenAI Whisper's feature extractor)
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
29
+
30
+
31
def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.

    Parameters
    ----------
    file: Union[str, np.ndarray]
        The audio file to open or a numpy array containing the audio data.

    sr: int
        The sample rate to resample the audio if necessary.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if isinstance(file, np.ndarray):
        # Normalize ndarray input to mono float32 before handing it to ffmpeg.
        if file.dtype != np.float32:
            file = file.astype(np.float32)
        if file.ndim > 1:
            file = np.mean(file, axis=1)

        # ffmpeg reads from a path, so round-trip the array through a temp WAV.
        # NOTE(review): the WAV is written at SAMPLE_RATE regardless of the
        # array's true rate — assumes ndarray input is already 16 kHz; confirm
        # at call sites.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
        temp_file_path = temp_file.name
        temp_file.close()
    else:
        temp_file_path = file

    try:
        # Decode to raw 16-bit little-endian PCM, mono, resampled to `sr`,
        # streamed to stdout ("-") and captured via subprocess.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            temp_file_path,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            "-",
        ]
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    finally:
        # Only delete the temp file when we created it ourselves above.
        if isinstance(file, np.ndarray):
            os.remove(temp_file_path)

    # Convert s16le bytes back to float32 in [-1, 1).
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
86
+
87
+
88
def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad with trailing zeros or truncate `array` along `axis` so its size is
    exactly `length` (N_SAMPLES by default), as expected by the encoder.
    Works on both torch tensors and numpy arrays.
    """
    is_tensor = torch.is_tensor(array)

    # Truncate when the axis is too long.
    if array.shape[axis] > length:
        if is_tensor:
            keep = torch.arange(length, device=array.device)
            array = array.index_select(dim=axis, index=keep)
        else:
            array = array.take(indices=range(length), axis=axis)

    # Zero-pad on the right when the axis is too short.
    if array.shape[axis] < length:
        pad_spec = [(0, 0)] * array.ndim
        pad_spec[axis] = (0, length - array.shape[axis])
        if is_tensor:
            # F.pad wants a flat list ordered from the last dimension inward.
            flat_spec = [amount for pair in pad_spec[::-1] for amount in pair]
            array = F.pad(array, flat_spec)
        else:
            array = np.pad(array, pad_spec)

    return array
112
+
113
+
114
@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
    # Result is cached per (device, n_mels). The precomputed filterbank asset
    # must ship in an "assets" directory next to this module.
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
130
+
131
+
132
def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of the given audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters (80 and 128 are supported, see mel_filters)

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    # Drop the final STFT frame so the frame count matches Whisper's layout.
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    # Log compression with a floor, clamped to an 8-decade dynamic range,
    # then rescaled roughly into [-1, 1] as Whisper's encoder expects.
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec
modules/diarize/diarize_pipeline.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import os
6
+ from pyannote.audio import Pipeline
7
+ from typing import Optional, Union
8
+ import torch
9
+
10
+ from modules.whisper.data_classes import *
11
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
12
+ from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
13
+
14
+
15
class DiarizationPipeline:
    """Thin wrapper around pyannote's pretrained speaker-diarization pipeline.

    Loads the model once in the constructor and exposes a callable that
    returns the diarization as a pandas DataFrame.
    """

    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.1",
        cache_dir: str = DIARIZATION_MODELS_DIR,
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        if isinstance(device, str):
            device = torch.device(device)
        # use_auth_token is only needed on first download of the gated model.
        self.model = Pipeline.from_pretrained(
            model_name,
            use_auth_token=use_auth_token,
            cache_dir=cache_dir
        ).to(device)

    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
        """Diarize a file path or waveform; return a DataFrame with
        'segment', 'label', 'speaker', 'start', and 'end' columns."""
        if isinstance(audio, str):
            audio = load_audio(audio)
        # pyannote expects a (channels, samples) waveform tensor plus rate.
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
        # Flatten the pyannote annotation into one row per speaker turn.
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df
43
+
44
+
45
def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
    """Attach speaker labels from a diarization DataFrame to transcript segments.

    For each segment (and each word with timestamps), picks the speaker whose
    diarized turns overlap it the most; with `fill_nearest`, falls back to the
    closest turn when nothing overlaps. Mutates `diarize_df` in place by adding
    'intersection' and 'union' columns, and returns {"segments": [...]} as
    plain dicts with "speaker" keys filled in where a speaker was found.
    """
    def best_speaker(start, end):
        # Signed overlap between every diarized turn and [start, end];
        # negative values mean the turn is disjoint from the interval.
        diarize_df['intersection'] = np.minimum(diarize_df['end'], end) - np.maximum(diarize_df['start'], start)
        diarize_df['union'] = np.maximum(diarize_df['end'], end) - np.minimum(diarize_df['start'], start)

        overlapping = diarize_df[diarize_df["intersection"] > 0]
        if len(overlapping) > 0:
            # Speaker with the greatest total overlap wins.
            return overlapping.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
        if fill_nearest:
            # Least-negative intersection == nearest turn in time.
            return diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
        return None

    segments = transcript_result["segments"]
    # Accept either Segment models or plain dicts; normalize to dicts.
    if segments and isinstance(segments[0], Segment):
        segments = [seg.model_dump() for seg in segments]

    for seg in segments:
        speaker = best_speaker(seg['start'], seg['end'])
        if speaker is not None:
            seg["speaker"] = speaker

        words = seg.get('words')
        if words is not None:
            for word in words:
                if 'start' in word:
                    word_speaker = best_speaker(word['start'], word['end'])
                    if word_speaker is not None:
                        word["speaker"] = word_speaker

    return {"segments": segments}
92
+
93
+
94
class DiarizationSegment:
    """Plain container for a single diarized time span (speaker optional)."""

    def __init__(self, start, end, speaker=None):
        self.start, self.end, self.speaker = start, end, speaker
modules/diarize/diarizer.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from typing import List, Union, BinaryIO, Optional, Tuple
4
+ import numpy as np
5
+ import time
6
+ import logging
7
+ import spaces
8
+
9
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
10
+ from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
+ from modules.diarize.audio_loader import load_audio
12
+ from modules.whisper.data_classes import *
13
+
14
+
15
class Diarizer:
    """Speaker diarization as a post-processing step for Whisper transcripts.

    Wraps `DiarizationPipeline` (pyannote) and rewrites each transcribed
    segment's text as "SPEAKER|text".
    """

    def __init__(self,
                 model_dir: str = DIARIZATION_MODELS_DIR
                 ):
        self.device = self.get_device()
        self.available_device = self.get_available_device()
        self.compute_type = "float16"
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        # Lazily constructed by update_pipe() on the first run() call.
        self.pipe = None

    @spaces.GPU
    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            transcribed_result: List[Segment],
            use_auth_token: str,
            device: Optional[str] = None
            ) -> Tuple[List[Segment], float]:
        """
        Diarize transcribed result as a post-processing

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        transcribed_result: List[Segment]
            transcribed result through whisper.
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: Optional[str]
            Device for diarization.

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        start_time = time.time()

        if device is None:
            device = self.device

        # (Re)build the pipeline when the device changed or it was never built.
        # NOTE(review): update_pipe() can return without setting self.pipe when
        # no token is given and no model is cached; self.pipe(audio) below
        # would then fail — confirm callers supply a token on first use.
        if device != self.device or self.pipe is None:
            self.update_pipe(
                device=device,
                use_auth_token=use_auth_token
            )

        audio = load_audio(audio)

        diarization_segments = self.pipe(audio)
        diarized_result = assign_word_speakers(
            diarization_segments,
            {"segments": transcribed_result}
        )

        segments_result = []
        for segment in diarized_result["segments"]:
            speaker = "None"
            if "speaker" in segment:
                speaker = segment["speaker"]
            # Encode the speaker into the segment text as "SPEAKER|text".
            diarized_text = speaker + "|" + segment["text"].strip()
            segments_result.append(Segment(
                start=segment["start"],
                end=segment["end"],
                text=diarized_text
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    @spaces.GPU
    def update_pipe(self,
                    use_auth_token: str,
                    device: str
                    ):
        """
        Set pipeline for diarization

        Parameters
        ----------
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: str
            Device for diarization.
        """
        self.device = device

        os.makedirs(self.model_dir, exist_ok=True)

        # Without a cached model or a token, downloading the gated model is
        # impossible — warn and bail out instead of raising deep inside pyannote.
        if (not os.listdir(self.model_dir) and
                not use_auth_token):
            print(
                "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
            )
            return

        logger = logging.getLogger("speechbrain.utils.train_logger")
        # Disable redundant torchvision warning message
        logger.disabled = True
        self.pipe = DiarizationPipeline(
            use_auth_token=use_auth_token,
            device=device,
            cache_dir=self.model_dir
        )
        logger.disabled = False

    @staticmethod
    @spaces.GPU
    def get_device():
        """Pick the single best device: cuda > mps > cpu."""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    @spaces.GPU
    def get_available_device():
        """List every usable device.

        Fix: the original used `elif`, so "mps" was never reported whenever
        CUDA was available; each backend is now checked independently.
        """
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        if torch.backends.mps.is_available():
            devices.append("mps")
        return devices
modules/translation/__init__.py ADDED
File without changes
modules/translation/deepl_api.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ import os
4
+ from datetime import datetime
5
+ import gradio as gr
6
+
7
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
+ from modules.utils.subtitle_manager import *
10
+ from modules.utils.files_manager import load_yaml, save_yaml
11
+
12
+ """
13
+ This is written with reference to the DeepL API documentation.
14
+ If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
15
+ """
16
+
17
+ DEEPL_AVAILABLE_TARGET_LANGS = {
18
+ 'Bulgarian': 'BG',
19
+ 'Czech': 'CS',
20
+ 'Danish': 'DA',
21
+ 'German': 'DE',
22
+ 'Greek': 'EL',
23
+ 'English': 'EN',
24
+ 'English (British)': 'EN-GB',
25
+ 'English (American)': 'EN-US',
26
+ 'Spanish': 'ES',
27
+ 'Estonian': 'ET',
28
+ 'Finnish': 'FI',
29
+ 'French': 'FR',
30
+ 'Hungarian': 'HU',
31
+ 'Indonesian': 'ID',
32
+ 'Italian': 'IT',
33
+ 'Japanese': 'JA',
34
+ 'Korean': 'KO',
35
+ 'Lithuanian': 'LT',
36
+ 'Latvian': 'LV',
37
+ 'Norwegian (Bokmål)': 'NB',
38
+ 'Dutch': 'NL',
39
+ 'Polish': 'PL',
40
+ 'Portuguese': 'PT',
41
+ 'Portuguese (Brazilian)': 'PT-BR',
42
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
43
+ 'Romanian': 'RO',
44
+ 'Russian': 'RU',
45
+ 'Slovak': 'SK',
46
+ 'Slovenian': 'SL',
47
+ 'Swedish': 'SV',
48
+ 'Turkish': 'TR',
49
+ 'Ukrainian': 'UK',
50
+ 'Chinese (simplified)': 'ZH'
51
+ }
52
+
53
+ DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
+ 'Bulgarian': 'BG',
56
+ 'Czech': 'CS',
57
+ 'Danish': 'DA',
58
+ 'German': 'DE',
59
+ 'Greek': 'EL',
60
+ 'English': 'EN',
61
+ 'Spanish': 'ES',
62
+ 'Estonian': 'ET',
63
+ 'Finnish': 'FI',
64
+ 'French': 'FR',
65
+ 'Hungarian': 'HU',
66
+ 'Indonesian': 'ID',
67
+ 'Italian': 'IT',
68
+ 'Japanese': 'JA',
69
+ 'Korean': 'KO',
70
+ 'Lithuanian': 'LT',
71
+ 'Latvian': 'LV',
72
+ 'Norwegian (Bokmål)': 'NB',
73
+ 'Dutch': 'NL',
74
+ 'Polish': 'PL',
75
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
76
+ 'Romanian': 'RO',
77
+ 'Russian': 'RU',
78
+ 'Slovak': 'SK',
79
+ 'Slovenian': 'SL',
80
+ 'Swedish': 'SV',
81
+ 'Turkish': 'TR',
82
+ 'Ukrainian': 'UK',
83
+ 'Chinese': 'ZH'
84
+ }
85
+
86
+
87
class DeepLAPI:
    """Client for translating subtitle files with the DeepL REST API.

    See https://www.deepl.com/docs-api/documents for the API itself.
    """

    def __init__(self,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        # Seconds to sleep between requests, to stay under DeepL rate limits.
        self.api_interval = 1
        # DeepL accepts up to 50 text entries per request.
        self.max_text_batch_size = 50
        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
        self.output_dir = output_dir

    def translate_deepl(self,
                        auth_key: str,
                        fileobjs: list,
                        source_lang: str,
                        target_lang: str,
                        is_pro: bool = False,
                        add_timestamp: bool = True,
                        progress=gr.Progress()) -> list:
        """
        Translate subtitle files using DeepL API
        Parameters
        ----------
        auth_key: str
            API Key for DeepL from gr.Textbox()
        fileobjs: list
            List of files to transcribe from gr.Files()
        source_lang: str
            Source language of the file to transcribe from gr.Dropdown()
        target_lang: str
            Target language of the file to transcribe from gr.Dropdown()
        is_pro: bool
            Boolean value that is about pro user or not from gr.Checkbox().
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
        String to return to gr.Textbox()
        Files to return to gr.Files()
        """
        # Gradio hands over NamedString objects; normalize to plain paths.
        if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
            fileobjs = [fileobj.name for fileobj in fileobjs]

        self.cache_parameters(
            api_key=auth_key,
            is_pro=is_pro,
            source_lang=source_lang,
            target_lang=target_lang,
            add_timestamp=add_timestamp
        )

        files_info = {}
        for file_path in fileobjs:
            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
            writer = get_writer(file_ext, self.output_dir)
            segments = writer.to_segments(file_path)

            # Translate in batches of max_text_batch_size, updating segments
            # in place with the translated text.
            batch_size = self.max_text_batch_size
            for batch_start in range(0, len(segments), batch_size):
                progress(batch_start / len(segments), desc="Translating..")
                sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start + batch_size]]
                translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                target_lang, is_pro)
                for i, translated_text in enumerate(translated_texts):
                    segments[batch_start + i].text = translated_text["text"]

            subtitle, output_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_ext,
                result=segments,
                add_timestamp=add_timestamp
            )

            files_info[file_name] = {"subtitle": subtitle, "path": output_path}

        total_result = ''
        for file_name, info in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
            total_result += f'{info["subtitle"]}'
        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

        output_file_paths = [item["path"] for key, item in files_info.items()]
        return [gr_str, output_file_paths]

    def request_deepl_translate(self,
                                auth_key: str,
                                text: list,
                                source_lang: str,
                                target_lang: str,
                                is_pro: bool = False):
        """Request API response to DeepL server; returns the 'translations' list."""
        # Fix: the two f-string halves were previously joined without a space
        # ("...is not supported.Use one of...").
        if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
            raise ValueError(f"Source language {source_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
        if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
            raise ValueError(f"Target language {target_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")

        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
            'Authorization': f'DeepL-Auth-Key {auth_key}'
        }
        data = {
            'text': text,
            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
        }
        response = requests.post(url, headers=headers, data=data)
        # Fail loudly on HTTP errors (e.g. 403 bad key, 456 quota exceeded)
        # instead of surfacing a confusing KeyError on "translations" below.
        response.raise_for_status()
        result = response.json()
        time.sleep(self.api_interval)
        return result["translations"]

    @staticmethod
    def cache_parameters(api_key: str,
                         is_pro: bool,
                         source_lang: str,
                         target_lang: str,
                         add_timestamp: bool):
        """Persist the last-used DeepL settings back to the parameters YAML."""
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["deepl"] = {
            "api_key": api_key,
            "is_pro": is_pro,
            "source_lang": source_lang,
            "target_lang": target_lang
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/translation/nllb_inference.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import gradio as gr
3
+ import os
4
+ import spaces
5
+
6
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
+ import modules.translation.translation_base as base
8
+
9
+
10
class NLLBInference(base.TranslationBase):
    """TranslationBase implementation backed by facebook's NLLB-200 models.

    Loads model/tokenizer via HuggingFace transformers and translates through
    a `translation` pipeline configured with fixed source/target languages.
    """

    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir
        )
        self.tokenizer = None
        self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        # Lazily built by update_model(); holds the transformers pipeline.
        self.pipeline = None

    @spaces.GPU(duration=120)
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        """Translate a single string with the currently loaded pipeline."""
        result = self.pipeline(
            text,
            max_length=max_length
        )
        return result[0]["translation_text"]

    @spaces.GPU(duration=120)
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """(Re)load model/tokenizer if the size changed and rebuild the
        translation pipeline for the given language pair."""
        def validate_language(lang: str) -> str:
            # Accepts either a display name ("Korean") or an NLLB code
            # ("kor_Hang"); always returns the NLLB code.
            if lang in NLLB_AVAILABLE_LANGS:
                return NLLB_AVAILABLE_LANGS[lang]
            elif lang not in NLLB_AVAILABLE_LANGS.values():
                raise ValueError(f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
            return lang

        src_lang = validate_language(src_lang)
        tgt_lang = validate_language(tgt_lang)

        if model_size != self.current_model_size or self.model is None:
            print("\nInitializing NLLB Model..\n")
            progress(0, desc="Initializing NLLB Model..")
            self.current_model_size = model_size
            # Avoid re-downloading when the model is already cached on disk.
            local_files_only = self.is_model_exists(self.current_model_size)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
                                                               cache_dir=self.model_dir,
                                                               local_files_only=local_files_only)
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                           cache_dir=os.path.join(self.model_dir, "tokenizers"),
                                                           local_files_only=local_files_only)

        # The pipeline is rebuilt on every call so src/tgt language changes
        # take effect even when the model itself is unchanged.
        self.pipeline = pipeline("translation",
                                 model=self.model,
                                 tokenizer=self.tokenizer,
                                 src_lang=src_lang,
                                 tgt_lang=tgt_lang,
                                 device=self.device)

    def is_model_exists(self,
                        model_size: str):
        """Check if model exists or not (Only facebook model)"""
        # HuggingFace cache layout: "models--facebook--<model-name>".
        prefix = "models--facebook--"
        _id, model_size_name = model_size.split("/")
        model_dir_name = prefix + model_size_name
        model_dir_path = os.path.join(self.model_dir, model_dir_name)
        if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
            return True
        return False
+
83
+
84
+ NLLB_AVAILABLE_LANGS = {
85
+ "Acehnese (Arabic script)": "ace_Arab",
86
+ "Acehnese (Latin script)": "ace_Latn",
87
+ "Mesopotamian Arabic": "acm_Arab",
88
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
89
+ "Tunisian Arabic": "aeb_Arab",
90
+ "Afrikaans": "afr_Latn",
91
+ "South Levantine Arabic": "ajp_Arab",
92
+ "Akan": "aka_Latn",
93
+ "Amharic": "amh_Ethi",
94
+ "North Levantine Arabic": "apc_Arab",
95
+ "Modern Standard Arabic": "arb_Arab",
96
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
97
+ "Najdi Arabic": "ars_Arab",
98
+ "Moroccan Arabic": "ary_Arab",
99
+ "Egyptian Arabic": "arz_Arab",
100
+ "Assamese": "asm_Beng",
101
+ "Asturian": "ast_Latn",
102
+ "Awadhi": "awa_Deva",
103
+ "Central Aymara": "ayr_Latn",
104
+ "South Azerbaijani": "azb_Arab",
105
+ "North Azerbaijani": "azj_Latn",
106
+ "Bashkir": "bak_Cyrl",
107
+ "Bambara": "bam_Latn",
108
+ "Balinese": "ban_Latn",
109
+ "Belarusian": "bel_Cyrl",
110
+ "Bemba": "bem_Latn",
111
+ "Bengali": "ben_Beng",
112
+ "Bhojpuri": "bho_Deva",
113
+ "Banjar (Arabic script)": "bjn_Arab",
114
+ "Banjar (Latin script)": "bjn_Latn",
115
+ "Standard Tibetan": "bod_Tibt",
116
+ "Bosnian": "bos_Latn",
117
+ "Buginese": "bug_Latn",
118
+ "Bulgarian": "bul_Cyrl",
119
+ "Catalan": "cat_Latn",
120
+ "Cebuano": "ceb_Latn",
121
+ "Czech": "ces_Latn",
122
+ "Chokwe": "cjk_Latn",
123
+ "Central Kurdish": "ckb_Arab",
124
+ "Crimean Tatar": "crh_Latn",
125
+ "Welsh": "cym_Latn",
126
+ "Danish": "dan_Latn",
127
+ "German": "deu_Latn",
128
+ "Southwestern Dinka": "dik_Latn",
129
+ "Dyula": "dyu_Latn",
130
+ "Dzongkha": "dzo_Tibt",
131
+ "Greek": "ell_Grek",
132
+ "English": "eng_Latn",
133
+ "Esperanto": "epo_Latn",
134
+ "Estonian": "est_Latn",
135
+ "Basque": "eus_Latn",
136
+ "Ewe": "ewe_Latn",
137
+ "Faroese": "fao_Latn",
138
+ "Fijian": "fij_Latn",
139
+ "Finnish": "fin_Latn",
140
+ "Fon": "fon_Latn",
141
+ "French": "fra_Latn",
142
+ "Friulian": "fur_Latn",
143
+ "Nigerian Fulfulde": "fuv_Latn",
144
+ "Scottish Gaelic": "gla_Latn",
145
+ "Irish": "gle_Latn",
146
+ "Galician": "glg_Latn",
147
+ "Guarani": "grn_Latn",
148
+ "Gujarati": "guj_Gujr",
149
+ "Haitian Creole": "hat_Latn",
150
+ "Hausa": "hau_Latn",
151
+ "Hebrew": "heb_Hebr",
152
+ "Hindi": "hin_Deva",
153
+ "Chhattisgarhi": "hne_Deva",
154
+ "Croatian": "hrv_Latn",
155
+ "Hungarian": "hun_Latn",
156
+ "Armenian": "hye_Armn",
157
+ "Igbo": "ibo_Latn",
158
+ "Ilocano": "ilo_Latn",
159
+ "Indonesian": "ind_Latn",
160
+ "Icelandic": "isl_Latn",
161
+ "Italian": "ita_Latn",
162
+ "Javanese": "jav_Latn",
163
+ "Japanese": "jpn_Jpan",
164
+ "Kabyle": "kab_Latn",
165
+ "Jingpho": "kac_Latn",
166
+ "Kamba": "kam_Latn",
167
+ "Kannada": "kan_Knda",
168
+ "Kashmiri (Arabic script)": "kas_Arab",
169
+ "Kashmiri (Devanagari script)": "kas_Deva",
170
+ "Georgian": "kat_Geor",
171
+ "Central Kanuri (Arabic script)": "knc_Arab",
172
+ "Central Kanuri (Latin script)": "knc_Latn",
173
+ "Kazakh": "kaz_Cyrl",
174
+ "Kabiyè": "kbp_Latn",
175
+ "Kabuverdianu": "kea_Latn",
176
+ "Khmer": "khm_Khmr",
177
+ "Kikuyu": "kik_Latn",
178
+ "Kinyarwanda": "kin_Latn",
179
+ "Kyrgyz": "kir_Cyrl",
180
+ "Kimbundu": "kmb_Latn",
181
+ "Northern Kurdish": "kmr_Latn",
182
+ "Kikongo": "kon_Latn",
183
+ "Korean": "kor_Hang",
184
+ "Lao": "lao_Laoo",
185
+ "Ligurian": "lij_Latn",
186
+ "Limburgish": "lim_Latn",
187
+ "Lingala": "lin_Latn",
188
+ "Lithuanian": "lit_Latn",
189
+ "Lombard": "lmo_Latn",
190
+ "Latgalian": "ltg_Latn",
191
+ "Luxembourgish": "ltz_Latn",
192
+ "Luba-Kasai": "lua_Latn",
193
+ "Ganda": "lug_Latn",
194
+ "Luo": "luo_Latn",
195
+ "Mizo": "lus_Latn",
196
+ "Standard Latvian": "lvs_Latn",
197
+ "Magahi": "mag_Deva",
198
+ "Maithili": "mai_Deva",
199
+ "Malayalam": "mal_Mlym",
200
+ "Marathi": "mar_Deva",
201
+ "Minangkabau (Arabic script)": "min_Arab",
202
+ "Minangkabau (Latin script)": "min_Latn",
203
+ "Macedonian": "mkd_Cyrl",
204
+ "Plateau Malagasy": "plt_Latn",
205
+ "Maltese": "mlt_Latn",
206
+ "Meitei (Bengali script)": "mni_Beng",
207
+ "Halh Mongolian": "khk_Cyrl",
208
+ "Mossi": "mos_Latn",
209
+ "Maori": "mri_Latn",
210
+ "Burmese": "mya_Mymr",
211
+ "Dutch": "nld_Latn",
212
+ "Norwegian Nynorsk": "nno_Latn",
213
+ "Norwegian Bokmål": "nob_Latn",
214
+ "Nepali": "npi_Deva",
215
+ "Northern Sotho": "nso_Latn",
216
+ "Nuer": "nus_Latn",
217
+ "Nyanja": "nya_Latn",
218
+ "Occitan": "oci_Latn",
219
+ "West Central Oromo": "gaz_Latn",
220
+ "Odia": "ory_Orya",
221
+ "Pangasinan": "pag_Latn",
222
+ "Eastern Panjabi": "pan_Guru",
223
+ "Papiamento": "pap_Latn",
224
+ "Western Persian": "pes_Arab",
225
+ "Polish": "pol_Latn",
226
+ "Portuguese": "por_Latn",
227
+ "Dari": "prs_Arab",
228
+ "Southern Pashto": "pbt_Arab",
229
+ "Ayacucho Quechua": "quy_Latn",
230
+ "Romanian": "ron_Latn",
231
+ "Rundi": "run_Latn",
232
+ "Russian": "rus_Cyrl",
233
+ "Sango": "sag_Latn",
234
+ "Sanskrit": "san_Deva",
235
+ "Santali": "sat_Olck",
236
+ "Sicilian": "scn_Latn",
237
+ "Shan": "shn_Mymr",
238
+ "Sinhala": "sin_Sinh",
239
+ "Slovak": "slk_Latn",
240
+ "Slovenian": "slv_Latn",
241
+ "Samoan": "smo_Latn",
242
+ "Shona": "sna_Latn",
243
+ "Sindhi": "snd_Arab",
244
+ "Somali": "som_Latn",
245
+ "Southern Sotho": "sot_Latn",
246
+ "Spanish": "spa_Latn",
247
+ "Tosk Albanian": "als_Latn",
248
+ "Sardinian": "srd_Latn",
249
+ "Serbian": "srp_Cyrl",
250
+ "Swati": "ssw_Latn",
251
+ "Sundanese": "sun_Latn",
252
+ "Swedish": "swe_Latn",
253
+ "Swahili": "swh_Latn",
254
+ "Silesian": "szl_Latn",
255
+ "Tamil": "tam_Taml",
256
+ "Tatar": "tat_Cyrl",
257
+ "Telugu": "tel_Telu",
258
+ "Tajik": "tgk_Cyrl",
259
+ "Tagalog": "tgl_Latn",
260
+ "Thai": "tha_Thai",
261
+ "Tigrinya": "tir_Ethi",
262
+ "Tamasheq (Latin script)": "taq_Latn",
263
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
264
+ "Tok Pisin": "tpi_Latn",
265
+ "Tswana": "tsn_Latn",
266
+ "Tsonga": "tso_Latn",
267
+ "Turkmen": "tuk_Latn",
268
+ "Tumbuka": "tum_Latn",
269
+ "Turkish": "tur_Latn",
270
+ "Twi": "twi_Latn",
271
+ "Central Atlas Tamazight": "tzm_Tfng",
272
+ "Uyghur": "uig_Arab",
273
+ "Ukrainian": "ukr_Cyrl",
274
+ "Umbundu": "umb_Latn",
275
+ "Urdu": "urd_Arab",
276
+ "Northern Uzbek": "uzn_Latn",
277
+ "Venetian": "vec_Latn",
278
+ "Vietnamese": "vie_Latn",
279
+ "Waray": "war_Latn",
280
+ "Wolof": "wol_Latn",
281
+ "Xhosa": "xho_Latn",
282
+ "Eastern Yiddish": "ydd_Hebr",
283
+ "Yoruba": "yor_Latn",
284
+ "Yue Chinese": "yue_Hant",
285
+ "Chinese (Simplified)": "zho_Hans",
286
+ "Chinese (Traditional)": "zho_Hant",
287
+ "Standard Malay": "zsm_Latn",
288
+ "Zulu": "zul_Latn",
289
+ }
modules/translation/translation_base.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from datetime import datetime
7
+ import spaces
8
+
9
+ import modules.translation.nllb_inference as nllb
10
+ from modules.whisper.data_classes import *
11
+ from modules.utils.subtitle_manager import *
12
+ from modules.utils.files_manager import load_yaml, save_yaml
13
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
14
+
15
+
16
class TranslationBase(ABC):
    """Abstract base for text-translation backends (e.g. NLLB).

    Provides the shared plumbing: model/output directory setup, device
    selection, subtitle-file translation, CUDA cleanup and UI parameter
    caching. Subclasses implement update_model() and translate().
    """

    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__()
        self.model = None  # backend model, loaded lazily by update_model()
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.current_model_size = None  # name of the currently loaded model, if any
        self.device = self.get_device()

    @abstractmethod
    @spaces.GPU(duration=120)
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        """Translate a single text chunk and return the translated string."""
        pass

    @abstractmethod
    @spaces.GPU(duration=120)
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """Load (or reuse) the model for the given size and language pair."""
        pass

    @spaces.GPU(duration=120)
    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       max_length: int = 200,
                       add_timestamp: bool = True,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle file from source language to target language

        Parameters
        ----------
        fileobjs: list
            List of files to transcribe from gr.Files()
        model_size: str
            Whisper model size from gr.Dropdown()
        src_lang: str
            Source language of the file to translate from gr.Dropdown()
        tgt_lang: str
            Target language of the file to translate from gr.Dropdown()
        max_length: int
            Max length per line to translate
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Files to return to gr.Files()
        """
        try:
            # gr.Files() may hand over NamedString objects; normalize to plain paths.
            if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
                fileobjs = [file.name for file in fileobjs]

            # Persist the chosen settings so the UI can restore them next launch.
            self.cache_parameters(model_size=model_size,
                                  src_lang=src_lang,
                                  tgt_lang=tgt_lang,
                                  max_length=max_length,
                                  add_timestamp=add_timestamp)

            self.update_model(model_size=model_size,
                              src_lang=src_lang,
                              tgt_lang=tgt_lang,
                              progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                # file_ext keeps its leading dot; get_writer()/generate_file()
                # strip it when resolving the output format.
                file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
                writer = get_writer(file_ext, self.output_dir)
                segments = writer.to_segments(fileobj)
                for i, segment in enumerate(segments):
                    progress(i / len(segments), desc="Translating..")
                    translated_text = self.translate(segment.text, max_length=max_length)
                    segment.text = translated_text

                subtitle, file_path = generate_file(
                    output_dir=self.output_dir,
                    output_file_name=file_name,
                    output_format=file_ext,
                    result=segments,
                    add_timestamp=add_timestamp
                )

                files_info[file_name] = {"subtitle": subtitle, "path": file_path}

            # Concatenate every file's translated subtitle for the result textbox.
            total_result = ''
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
            gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

            output_file_paths = [item["path"] for key, item in files_info.items()]
            return [gr_str, output_file_paths]

        except Exception as e:
            print(f"Error translating file: {e}")
            raise
        finally:
            # Always free GPU memory, even when translation failed.
            self.release_cuda_memory()

    @staticmethod
    @spaces.GPU(duration=120)
    def get_device():
        """Pick the best available torch device: cuda > mps > cpu."""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    @spaces.GPU(duration=120)
    def release_cuda_memory():
        """Empty the CUDA cache and reset the peak-allocation statistics."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        """Delete the given input files, silently skipping missing paths."""
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)

    @staticmethod
    def cache_parameters(model_size: str,
                         src_lang: str,
                         tgt_lang: str,
                         max_length: int,
                         add_timestamp: bool):
        """Save the current NLLB translation settings to the config YAML."""
        def validate_lang(lang: str):
            # If a raw NLLB code (e.g. "eng_Latn") is passed, store its
            # human-readable name instead so the UI dropdown can restore it.
            if lang in list(nllb.NLLB_AVAILABLE_LANGS.values()):
                flipped = {value: key for key, value in nllb.NLLB_AVAILABLE_LANGS.items()}
                return flipped[lang]
            return lang

        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["nllb"] = {
            "model_size": model_size,
            "source_lang": validate_lang(src_lang),
            "target_lang": validate_lang(tgt_lang),
            "max_length": max_length,
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/ui/__init__.py ADDED
File without changes
modules/ui/htmls.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS injected into the gradio app (donation button, markdown/tab
# spacing, and link styling for the project header).
CSS = """
.bmc-button {
padding: 2px 5px;
border-radius: 5px;
background-color: #FF813F;
color: white;
box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
text-decoration: none;
display: inline-block;
font-size: 20px;
margin: 2px;
cursor: pointer;
-webkit-transition: background-color 0.3s ease;
-ms-transition: background-color 0.3s ease;
transition: background-color 0.3s ease;
}
.bmc-button:hover,
.bmc-button:active,
.bmc-button:focus {
background-color: #FF5633;
}
.markdown {
margin-bottom: 0;
padding-bottom: 0;
}
.tabs {
margin-top: 0;
padding-top: 0;
}

#md_project a {
color: black;
text-decoration: none;
}
#md_project a:hover {
text-decoration: underline;
}
"""
39
+
40
# Project header shown at the top of the UI.
# Fixed broken repository link: "Whsiper-WebUI" -> "Whisper-WebUI".
MARKDOWN = """
### [Whisper-WebUI](https://github.com/jhj0517/Whisper-WebUI)
"""
43
+
44
+
45
# Collapsible HTML table shown in the translation tab, listing approximate
# VRAM requirements for each NLLB model size.
NLLB_VRAM_TABLE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
table {
border-collapse: collapse;
width: 100%;
}
th, td {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
}
th {
background-color: #f2f2f2;
}
</style>
</head>
<body>

<details>
<summary>VRAM usage for each model</summary>
<table>
<thead>
<tr>
<th>Model name</th>
<th>Required VRAM</th>
</tr>
</thead>
<tbody>
<tr>
<td>nllb-200-3.3B</td>
<td>~16GB</td>
</tr>
<tr>
<td>nllb-200-1.3B</td>
<td>~8GB</td>
</tr>
<tr>
<td>nllb-200-distilled-600M</td>
<td>~4GB</td>
</tr>
</tbody>
</table>
<p><strong>Note:</strong> Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>
</details>

</body>
</html>
"""
modules/utils/__init__.py ADDED
File without changes
modules/utils/cli_manager.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
def str2bool(v):
    """Parse a truthy/falsy CLI string into a bool (argparse type helper).

    Accepts bools unchanged; raises argparse.ArgumentTypeError otherwise.
    """
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
modules/utils/constants.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from gradio_i18n import Translate, gettext as _

# i18n-translated label for the "detect language automatically" dropdown option.
AUTOMATIC_DETECTION = _("Automatic Detection")
# Sentinel values used to represent "no value" in gradio components.
GRADIO_NONE_STR = ""
GRADIO_NONE_NUMBER_MAX = 9999
GRADIO_NONE_NUMBER_MIN = 0
modules/utils/files_manager.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fnmatch
3
+ from ruamel.yaml import YAML
4
+ from gradio.utils import NamedString
5
+
6
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
7
+
8
+
9
def load_yaml(path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    """Read a YAML file with ruamel's safe loader and return its content."""
    parser = YAML(typ="safe")
    parser.preserve_quotes = True
    with open(path, 'r', encoding='utf-8') as stream:
        return parser.load(stream)
15
+
16
+
17
def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    """Serialize `data` to YAML at `path` and return the path written."""
    dumper = YAML(typ="safe")
    # Block style, stable key order and 2/4-space indentation for readable diffs.
    dumper.map_indent = 2
    dumper.sequence_indent = 4
    dumper.sequence_dash_offset = 2
    dumper.preserve_quotes = True
    dumper.default_flow_style = False
    dumper.sort_base_mapping_type_on_output = False

    with open(path, 'w', encoding='utf-8') as stream:
        dumper.dump(data, stream)
    return path
29
+
30
+
31
def get_media_files(folder_path, include_sub_directory=False):
    """Collect paths of known video/audio files inside `folder_path`.

    When `include_sub_directory` is True the whole tree is walked recursively;
    otherwise only the folder's direct children are considered.
    """
    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
                        '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
    audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
    media_extensions = video_extensions + audio_extensions

    media_files = []

    if include_sub_directory:
        for root, _, files in os.walk(folder_path):
            for pattern in media_extensions:
                for matched in fnmatch.filter(files, pattern):
                    candidate = os.path.join(root, matched)
                    if os.path.exists(candidate):
                        media_files.append(candidate)
    else:
        entries = os.listdir(folder_path)
        for pattern in media_extensions:
            for matched in fnmatch.filter(entries, pattern):
                candidate = os.path.join(folder_path, matched)
                if os.path.isfile(candidate) and os.path.exists(candidate):
                    media_files.append(candidate)

    return media_files
54
+
55
+
56
def format_gradio_files(files: list):
    """Wrap plain path strings into gradio NamedString objects.

    Falsy input (None or empty list) is returned unchanged.
    """
    if not files:
        return files
    return [NamedString(path) for path in files]
64
+
65
+
66
def is_video(file_path):
    """Return True when the file's extension is a known video format."""
    video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
    _, extension = os.path.splitext(file_path)
    return extension.lower() in video_extensions
70
+
71
+
72
def read_file(file_path):
    """Return the whole text content of `file_path`, decoded as UTF-8."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()
modules/utils/paths.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Project root: two levels up from this file (modules/utils/ -> repo root).
WEBUI_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# Model download/cache directories, one per backend.
MODELS_DIR = os.path.join(WEBUI_DIR, "models")
WHISPER_MODELS_DIR = os.path.join(MODELS_DIR, "Whisper")
FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
# Configuration files.
CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
# Output directories for generated artifacts.
OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")

# Create every directory at import time. makedirs() also creates missing
# parents (e.g. UVR_OUTPUT_DIR via its two children), so the list need not
# include every intermediate directory.
for dir_path in [MODELS_DIR,
                 WHISPER_MODELS_DIR,
                 FASTER_WHISPER_MODELS_DIR,
                 INSANELY_FAST_WHISPER_MODELS_DIR,
                 NLLB_MODELS_DIR,
                 DIARIZATION_MODELS_DIR,
                 UVR_MODELS_DIR,
                 CONFIGS_DIR,
                 OUTPUT_DIR,
                 TRANSLATION_OUTPUT_DIR,
                 UVR_INSTRUMENTAL_OUTPUT_DIR,
                 UVR_VOCALS_OUTPUT_DIR]:
    os.makedirs(dir_path, exist_ok=True)
modules/utils/subtitle_manager.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import sys
7
+ import zlib
8
+ from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
9
+ from datetime import datetime
10
+
11
+ from modules.whisper.data_classes import Segment, Word
12
+ from .files_manager import read_file
13
+
14
+ # Zero GPU
15
+ import spaces
16
+
17
def format_timestamp(
    seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
) -> str:
    """Render a non-negative offset in seconds as ``[HH:]MM:SS<marker>mmm``.

    The hour field is emitted when `always_include_hours` is True or the
    offset reaches one hour.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)

    prefix = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{prefix}{minutes:02d}:{secs:02d}{decimal_marker}{millis:03d}"
36
+
37
+
38
def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
    """Inverse of format_timestamp: parse ``[HH:]MM:SS<marker>frac`` to seconds."""
    fields = time_str.split(":")

    if len(fields) == 3:
        hours_str, minutes_str, rest = fields
        hours = int(hours_str)
    else:
        minutes_str, rest = fields
        hours = 0

    whole_str, fraction_str = rest.split(decimal_marker)

    return (
        hours * 3600
        + int(minutes_str) * 60
        + int(whole_str)
        + float("0." + fraction_str)
    )
55
+
56
+
57
def get_start(segments: List[dict]) -> Optional[float]:
    """Start time of the first word across all segments.

    Falls back to the first segment's start when no words exist, and to
    None when there are no segments at all.
    """
    for segment in segments:
        for word in segment["words"]:
            return word["start"]
    return segments[0]["start"] if segments else None
62
+
63
+
64
def get_end(segments: List[dict]) -> Optional[float]:
    """End time of the last word across all segments.

    Falls back to the last segment's end when no words exist, and to
    None when there are no segments at all.
    """
    for segment in reversed(segments):
        for word in reversed(segment["words"]):
            return word["end"]
    return segments[-1]["end"] if segments else None
69
+
70
+
71
class ResultWriter:
    """Base class: serialize a transcription result into `output_dir`.

    Subclasses set `extension` and implement write_result().
    """
    extension: str

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    def __call__(
        self, result: Union[dict, List[Segment]], output_file_name: str,
        options: Optional[dict] = None, **kwargs
    ):
        # Normalize a list of Segment models into the whisper-style dict shape.
        if isinstance(result, List) and result and isinstance(result[0], Segment):
            result = {"segments": [seg.model_dump() for seg in result]}

        target = os.path.join(self.output_dir, f"{output_file_name}.{self.extension}")

        with open(target, "w", encoding="utf-8") as f:
            self.write_result(result, file=f, options=options, **kwargs)

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        raise NotImplementedError
95
+
96
+
97
class WriteTXT(ResultWriter):
    """Plain-text writer: one stripped segment text per line."""
    extension: str = "txt"

    def write_result(
        self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for segment in result["segments"]:
            line = segment["text"].strip()
            print(line, file=file, flush=True)
105
+
106
+
107
class SubtitlesWriter(ResultWriter):
    """Shared layout engine for timed subtitle formats (VTT/SRT/LRC).

    iterate_result() yields (start, end, text) cues, optionally re-flowing
    word-level timings into lines constrained by max_line_width,
    max_line_count and max_words_per_line.
    """
    # Subclasses configure how timestamps are rendered.
    always_include_hours: bool
    decimal_marker: str

    def iterate_result(
        self,
        result: dict,
        options: Optional[dict] = None,
        *,
        max_line_width: Optional[int] = None,
        max_line_count: Optional[int] = None,
        highlight_words: bool = False,
        align_lrc_words: bool = False,
        max_words_per_line: Optional[int] = None,
    ):
        """Yield (start, end, text) cues for `result`.

        Explicit keyword arguments take precedence over the same-named keys
        in `options`. When no word-level timings are present, one cue per
        segment is emitted.
        """
        options = options or {}
        max_line_width = max_line_width or options.get("max_line_width")
        max_line_count = max_line_count or options.get("max_line_count")
        highlight_words = highlight_words or options.get("highlight_words", False)
        align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
        max_words_per_line = max_words_per_line or options.get("max_words_per_line")
        # Keep original segment boundaries unless BOTH line limits are given.
        preserve_segments = max_line_count is None or max_line_width is None
        max_line_width = max_line_width or 1000
        max_words_per_line = max_words_per_line or 1000

        def iterate_subtitles():
            # Re-flow word timings into subtitles; breaks happen on line width,
            # line count, long pauses (>3s) or segment boundaries.
            line_len = 0
            line_count = 1
            # the next subtitle to yield (a list of word timings with whitespace)
            subtitle: List[dict] = []
            last: float = get_start(result["segments"]) or 0.0
            for segment in result["segments"]:
                chunk_index = 0
                words_count = max_words_per_line
                while chunk_index < len(segment["words"]):
                    remaining_words = len(segment["words"]) - chunk_index
                    if max_words_per_line > len(segment["words"]) - chunk_index:
                        words_count = remaining_words
                    for i, original_timing in enumerate(
                        segment["words"][chunk_index : chunk_index + words_count]
                    ):
                        # Copy so mutations below never touch the caller's data.
                        timing = original_timing.copy()
                        long_pause = (
                            not preserve_segments and timing["start"] - last > 3.0
                        )
                        has_room = line_len + len(timing["word"]) <= max_line_width
                        seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
                        if (
                            line_len > 0
                            and has_room
                            and not long_pause
                            and not seg_break
                        ):
                            # line continuation
                            line_len += len(timing["word"])
                        else:
                            # new line
                            timing["word"] = timing["word"].strip()
                            if (
                                len(subtitle) > 0
                                and max_line_count is not None
                                and (long_pause or line_count >= max_line_count)
                                or seg_break
                            ):
                                # subtitle break
                                yield subtitle
                                subtitle = []
                                line_count = 1
                            elif line_len > 0:
                                # line break
                                line_count += 1
                                timing["word"] = "\n" + timing["word"]
                            line_len = len(timing["word"].strip())
                        subtitle.append(timing)
                        last = timing["start"]
                    chunk_index += max_words_per_line
            # Flush whatever is left after the final segment.
            if len(subtitle) > 0:
                yield subtitle

        if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
            # Word-level timings are available: emit re-flowed cues.
            for subtitle in iterate_subtitles():
                subtitle_start = self.format_timestamp(subtitle[0]["start"])
                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
                subtitle_text = "".join([word["word"] for word in subtitle])
                if highlight_words:
                    last = subtitle_start
                    all_words = [timing["word"] for timing in subtitle]
                    for i, this_word in enumerate(subtitle):
                        start = self.format_timestamp(this_word["start"])
                        end = self.format_timestamp(this_word["end"])
                        if last != start:
                            # Fill the gap before this word with the plain cue.
                            yield last, start, subtitle_text

                        # Underline only the current word (<u>...</u>),
                        # preserving its leading whitespace.
                        yield start, end, "".join(
                            [
                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
                                if j == i
                                else word
                                for j, word in enumerate(all_words)
                            ]
                        )
                        last = end

                if align_lrc_words:
                    # LRC word-aligned mode: a timestamp before every word and
                    # one extra after the last word.
                    lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
                    l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
                    lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
                    lrc_aligned_words = ' '.join(lrc_aligned_words)
                    yield None, None, lrc_aligned_words

                else:
                    # NOTE(review): this `else` pairs with `if align_lrc_words`,
                    # not with `if highlight_words`, so when highlight_words is
                    # True the plain cue is yielded in addition to the
                    # highlighted ones — confirm this is intended.
                    yield subtitle_start, subtitle_end, subtitle_text
        else:
            # No word timings: one cue per segment. "-->" is sanitized so the
            # text cannot be mistaken for a "start --> end" cue line.
            for segment in result["segments"]:
                segment_start = self.format_timestamp(segment["start"])
                segment_end = self.format_timestamp(segment["end"])
                segment_text = segment["text"].strip().replace("-->", "->")
                yield segment_start, segment_end, segment_text

    def format_timestamp(self, seconds: float):
        # Render using this subtitle format's hour/decimal conventions.
        return format_timestamp(
            seconds=seconds,
            always_include_hours=self.always_include_hours,
            decimal_marker=self.decimal_marker,
        )
232
+
233
+
234
class WriteVTT(SubtitlesWriter):
    """WebVTT subtitle writer/reader."""
    extension: str = "vtt"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        print("WEBVTT\n", file=file)
        for start, end, text in self.iterate_result(result, options, **kwargs):
            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse a .vtt file back into a list of Segment objects."""
        segments = []

        for block in read_file(file_path).split('\n\n'):
            stripped = block.strip()
            # Skip empty blocks and the "WEBVTT" header block.
            if not stripped or stripped.startswith("WEBVTT"):
                continue
            lines = stripped.split('\n')
            start_str, end_str = lines[0].split(" --> ")
            segments.append(Segment(
                start=time_str_to_seconds(start_str, self.decimal_marker),
                end=time_str_to_seconds(end_str, self.decimal_marker),
                text=' '.join(lines[1:]),
            ))

        return segments
265
+
266
+
267
class WriteSRT(SubtitlesWriter):
    """SubRip (.srt) subtitle writer/reader."""
    extension: str = "srt"
    always_include_hours: bool = True
    decimal_marker: str = ","

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for index, (start, end, text) in enumerate(
            self.iterate_result(result, options, **kwargs), start=1
        ):
            print(f"{index}\n{start} --> {end}\n{text}\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse an .srt file back into a list of Segment objects."""
        segments = []

        for block in read_file(file_path).split('\n\n'):
            stripped = block.strip()
            if not stripped:
                continue
            lines = stripped.split('\n')
            # lines[0] is the numeric cue index; Segment does not need it.
            start_str, end_str = lines[1].split(" --> ")
            segments.append(Segment(
                start=time_str_to_seconds(start_str, self.decimal_marker),
                end=time_str_to_seconds(end_str, self.decimal_marker),
                text=' '.join(lines[2:]),
            ))

        return segments
300
+
301
+
302
class WriteLRC(SubtitlesWriter):
    """LRC lyrics-style writer/reader.

    Lines are written either as ``[start]text[end]`` or, with
    ``align_lrc_words``, as per-word ``[timestamp]word`` sequences.
    """
    extension: str = "lrc"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for i, (start, end, text) in enumerate(
            self.iterate_result(result, options, **kwargs), start=1
        ):
            if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
                # Word-aligned mode: iterate_result already embedded timestamps.
                print(f"{text}\n", file=file, flush=True)
            else:
                print(f"[{start}]{text}[{end}]\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse an .lrc file back into a list of Segment objects.

        Handles both line shapes produced by write_result: a single
        ``[start]text[end]`` triple, and word-aligned ``[t1]w1 [t2]w2 ... [tn]``
        runs, where each word's end time is the following timestamp.
        """
        segments = []

        for line in read_file(file_path).split('\n'):
            if line.strip() == '':
                continue
            # Split into alternating "[timestamp]" and text chunks; keep the
            # bracketed delimiters (capturing group) and drop empty pieces.
            parts = [part.strip() for part in re.split(r'(\[.*?\])', line.strip()) if part]

            # Text chunks sit at odd indices, bracketed by timestamps.
            # Bug fix: the previous code indexed parts[0..2] for every chunk
            # (via i % 2), duplicating the first word's times and text across
            # the whole line in word-aligned files. Index relative to i instead.
            for i in range(1, len(parts) - 1, 2):
                start_str = parts[i - 1].replace("[", "").replace("]", "")
                end_str = parts[i + 1].replace("[", "").replace("]", "")
                segments.append(Segment(
                    start=time_str_to_seconds(start_str, self.decimal_marker),
                    end=time_str_to_seconds(end_str, self.decimal_marker),
                    text=parts[i],
                ))

        return segments
344
+
345
+
346
class WriteTSV(ResultWriter):
    """
    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>

    Using integer milliseconds as start and end times means there's no chance of interference from
    an environment setting a language encoding that causes the decimal in a floating point number
    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        print("start", "end", "text", sep="\t", file=file)
        for segment in result["segments"]:
            start_ms = round(1000 * segment["start"])
            end_ms = round(1000 * segment["end"])
            text = segment["text"].strip().replace("\t", " ")
            print(start_ms, end_ms, text, sep="\t", file=file, flush=True)
366
+
367
+
368
class WriteJSON(ResultWriter):
    """Dump the raw result dict as a single JSON document."""
    extension: str = "json"

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        # Extra layout kwargs are irrelevant for JSON and are ignored.
        json.dump(result, file)
375
+
376
+
377
def get_writer(
    output_format: str, output_dir: str
) -> Callable[[dict, TextIO, dict], None]:
    """Return a writer for `output_format`, or an aggregate callable for "all".

    The format string is normalized (trimmed, lowercased, dots removed), so
    both ".srt" and "SRT" resolve to the SubRip writer.
    """
    fmt = output_format.strip().lower().replace(".", "")

    writers = {
        "txt": WriteTXT,
        "vtt": WriteVTT,
        "srt": WriteSRT,
        "tsv": WriteTSV,
        "json": WriteJSON,
        "lrc": WriteLRC
    }

    if fmt == "all":
        all_writers = [writer(output_dir) for writer in writers.values()]

        def write_all(
            result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
        ):
            # Emit the same result in every supported format.
            for writer in all_writers:
                writer(result, file, options, **kwargs)

        return write_all

    return writers[fmt](output_dir)
403
+
404
+
405
def generate_file(
    output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
    add_timestamp: bool = True, **kwargs
) -> Tuple[str, str]:
    """Write `result` as a subtitle/transcript file and return (content, path).

    Optionally appends a MMDDHHMMSS timestamp to the file name so repeated
    runs do not overwrite each other.
    """
    fmt = output_format.strip().lower().replace(".", "")
    if fmt == "webvtt":
        fmt = "vtt"

    if add_timestamp:
        output_file_name += f"-{datetime.now().strftime('%m%d%H%M%S')}"

    file_path = os.path.join(output_dir, f"{output_file_name}.{fmt}")
    file_writer = get_writer(output_format=fmt, output_dir=output_dir)

    # LRC has no word-highlight concept; map the request onto its
    # word-aligned timestamp mode instead.
    if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
        kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True

    file_writer(result=result, output_file_name=output_file_name, **kwargs)
    return read_file(file_path), file_path
425
+
426
# NOTE(review): @spaces.GPU on a pure string helper looks like a copy-paste
# from the GPU-bound functions — this reserves ZeroGPU time for no reason;
# confirm whether it can be dropped.
@spaces.GPU(duration=120)
def safe_filename(name):
    """Sanitize `name` for use as a filename.

    Replaces characters invalid on common filesystems with "_" and truncates
    the result to at most 20 characters, preserving the extension when it fits.
    """
    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
    # Truncate the filename if it exceeds the max_length (20)
    if len(safe_name) > 20:
        # NOTE(review): for names without a dot, split('.')[-1] is the whole
        # name, so the plain 20-char truncation branch below applies.
        file_extension = safe_name.split('.')[-1]
        if len(file_extension) + 1 < 20:
            truncated_name = safe_name[:20 - len(file_extension) - 1]
            safe_name = truncated_name + '.' + file_extension
        else:
            safe_name = safe_name[:20]
    return safe_name
modules/utils/youtube_manager.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pytubefix import YouTube
2
+ import subprocess
3
+ import os
4
+
5
+
6
def get_ytdata(link):
    """Create a pytubefix YouTube object for the given video URL."""
    return YouTube(link)
8
+
9
+
10
def get_ytmetas(link):
    """Return (thumbnail_url, title, description) for a YouTube video URL."""
    video = YouTube(link)
    return video.thumbnail_url, video.title, video.description
13
+
14
+
15
def get_ytaudio(ytdata: YouTube):
    """Download the audio-only stream of a YouTube video and return its path.

    The raw download is re-encoded through ffmpeg because the stream can be
    corrupted (see https://github.com/jhj0517/Whisper-WebUI/issues/304);
    returns None when the ffmpeg conversion fails.
    """
    # Somehow the audio is corrupted so need to convert to valid audio file.
    # Fix for : https://github.com/jhj0517/Whisper-WebUI/issues/304

    audio_path = ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
    temp_audio_path = os.path.join("modules", "yt_tmp_fixed.wav")

    try:
        # Re-encode into a fresh wav, then atomically swap it over the original.
        subprocess.run([
            'ffmpeg', '-y',
            '-i', audio_path,
            temp_audio_path
        ], check=True)

        os.replace(temp_audio_path, audio_path)
        return audio_path
    except subprocess.CalledProcessError as e:
        print(f"Error during ffmpeg conversion: {e}")
        return None
modules/uvr/music_separator.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union, List, Dict
2
+ import numpy as np
3
+ import torchaudio
4
+ import soundfile as sf
5
+ import os
6
+ import torch
7
+ import gc
8
+ import gradio as gr
9
+ from datetime import datetime
10
+
11
+ from uvr.models import MDX, Demucs, VrNetwork, MDXC
12
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
13
+ from modules.utils.files_manager import load_yaml, save_yaml, is_video
14
+ from modules.diarize.audio_loader import load_audio
15
+
16
class MusicSeparator:
    """Separate vocals and instrumental (BGM) tracks from audio using UVR MDX models.

    Separated tracks are written under `<output_dir>/instrumental` and
    `<output_dir>/vocals` when saving is requested.
    """

    def __init__(self,
                 model_dir: Optional[str] = None,
                 output_dir: Optional[str] = None):
        self.model = None
        self.device = self.get_device()
        self.available_devices = ["cpu", "cuda"]
        self.model_dir = model_dir
        self.output_dir = output_dir
        # NOTE(review): output_dir=None would crash os.path.join below — callers
        # appear to always pass a real path; confirm before relying on the default.
        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
        vocals_output_dir = os.path.join(self.output_dir, "vocals")
        os.makedirs(instrumental_output_dir, exist_ok=True)
        os.makedirs(vocals_output_dir, exist_ok=True)
        self.audio_info = None
        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
        self.default_model = self.available_models[0]
        self.current_model_size = self.default_model
        self.model_config = {
            "segment": 256,
            "split": True
        }

    def update_model(self,
                     model_name: str = "UVR-MDX-NET-Inst_1",
                     device: Optional[str] = None,
                     segment_size: int = 256):
        """
        Update model with the given model name

        Args:
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
        """
        if device is None:
            device = self.device

        self.device = device
        self.model_config = {
            "segment": segment_size,
            "split": True
        }
        self.model = MDX(name=model_name,
                         other_metadata=self.model_config,
                         device=self.device,
                         logger=None,
                         model_dir=self.model_dir)
        # FIX: record which model is loaded so `separate()` can skip redundant
        # re-initialization; previously this was never updated and the staleness
        # check below always saw the default model name.
        self.current_model_size = model_name

    def separate(self,
                 audio: Union[str, np.ndarray],
                 model_name: str,
                 device: Optional[str] = None,
                 segment_size: int = 256,
                 save_file: bool = False,
                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
        """
        Separate the background music from the audio.

        Args:
            audio (Union[str, np.ndarray]): Audio path or numpy array.
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
            save_file (bool): Whether to save the separated audio to output path or not.
            progress (gr.Progress): Gradio progress indicator.

        Returns:
            A Tuple of
            np.ndarray: Instrumental numpy arrays.
            np.ndarray: Vocals numpy arrays.
            file_paths: List of file paths where the separated audio is saved. Return empty when save_file is False.
        """
        if device is None:
            # FIX: resolve the default device before the staleness check below;
            # comparing `self.device != None` previously forced a model reload
            # on every call with the default argument.
            device = self.device

        if isinstance(audio, str):
            output_filename, ext = os.path.basename(audio), ".wav"
            output_filename, _ = os.path.splitext(output_filename)

            if is_video(audio):
                audio = load_audio(audio)
                sample_rate = 16000
            else:
                self.audio_info = torchaudio.info(audio)
                sample_rate = self.audio_info.sample_rate
        else:
            # Raw array input: name the outputs with a timestamp instead.
            timestamp = datetime.now().strftime("%m%d%H%M%S")
            output_filename, ext = f"UVR-{timestamp}", ".wav"
            sample_rate = 16000

        model_config = {
            "segment": segment_size,
            "split": True
        }

        # Reload the model only when something that affects inference changed.
        if (self.model is None or
                self.current_model_size != model_name or
                self.model_config != model_config or
                self.model.sample_rate != sample_rate or
                self.device != device):
            progress(0, desc="Initializing UVR Model..")
            self.update_model(
                model_name=model_name,
                device=device,
                segment_size=segment_size
            )
            self.model.sample_rate = sample_rate

        progress(0, desc="Separating background music from the audio..")
        result = self.model(audio)
        instrumental, vocals = result["instrumental"].T, result["vocals"].T

        file_paths = []
        if save_file:
            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
            file_paths += [instrumental_output_path, vocals_output_path]

        return instrumental, vocals, file_paths

    def separate_files(self,
                       files: List,
                       model_name: str,
                       device: Optional[str] = None,
                       segment_size: int = 256,
                       save_file: bool = True,
                       progress: gr.Progress = gr.Progress()) -> List[str]:
        """Separate the background music from the audio files. Returns only last Instrumental and vocals file paths
        to display into gr.Audio()"""
        self.cache_parameters(model_size=model_name, segment_size=segment_size)

        # FIX: initialize so an empty `files` list returns [] instead of
        # raising NameError on the final `return`.
        file_paths = []
        for file_path in files:
            instrumental, vocals, file_paths = self.separate(
                audio=file_path,
                model_name=model_name,
                device=device,
                segment_size=segment_size,
                save_file=save_file,
                progress=progress
            )
        return file_paths

    @staticmethod
    def get_device():
        """Get device for the model"""
        return "cuda" if torch.cuda.is_available() else "cpu"

    def offload(self):
        """Offload the model and free up the memory"""
        if self.model is not None:
            del self.model
            self.model = None
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        self.audio_info = None

    @staticmethod
    def cache_parameters(model_size: str,
                         segment_size: int):
        """Persist the last-used BGM-separation parameters to the default config YAML."""
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_uvr_params = cached_params["bgm_separation"]
        uvr_params_to_cache = {
            "model_size": model_size,
            "segment_size": segment_size
        }
        cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
        cached_params["bgm_separation"] = cached_uvr_params
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/vad/__init__.py ADDED
File without changes
modules/vad/silero_vad.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
2
+
3
+ from faster_whisper.vad import VadOptions, get_vad_model
4
+ import numpy as np
5
+ from typing import BinaryIO, Union, List, Optional, Tuple
6
+ import warnings
7
+ import faster_whisper
8
+ from modules.whisper.data_classes import *
9
+ from faster_whisper.transcribe import SpeechTimestampsMap
10
+ import gradio as gr
11
+
12
+
13
class SileroVAD:
    """Silero voice-activity-detection wrapper (adapted from faster-whisper).

    Removes non-speech regions from audio before transcription and can map
    the resulting segment timestamps back onto the original timeline.
    """

    def __init__(self):
        # Silero VAD operates on 16 kHz mono audio, scored in 512-sample windows.
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None  # loaded lazily by update_model()

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        # Decode file paths / binaries to a float array at 16 kHz.
        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        # NOTE(review): `duration` / `duration_after_vad` are computed but never
        # returned or logged by this method.
        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        # Keep only the detected speech regions, concatenated back to back.
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: Optional[VadOptions] = None,
        progress: gr.Progress = gr.Progress(),
        **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        sampling_rate = 16000
        # Convert every ms/s option into sample counts once, up front.
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        # Pass 1: score every fixed-size window with the Silero model.
        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            if len(chunk) < window_size_samples:
                # Zero-pad the final partial window to the model's input size.
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        # Pass 2: hysteresis state machine over the per-window probabilities.
        triggered = False
        speeches = []
        current_speech = {}
        # Exit threshold is lower than the entry threshold (hysteresis).
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                # Speech resumed before the silence was long enough to cut.
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                # Rising edge: open a new speech segment.
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                # Segment exceeded max length: force a split, preferring the
                # last observed silence (prev_end) as the cut point.
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    # No silence seen: hard cut at the current window.
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                # Falling edge candidate: remember where silence began.
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    # Silence long enough: close the segment (if it is long
                    # enough to count as speech) and reset the state machine.
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        # Flush a trailing open segment that reaches the end of the audio.
        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        # Pass 3: pad each segment, sharing the available silence between
        # neighbors when they are closer than twice the padding.
        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        # Fetch faster-whisper's bundled Silero ONNX model.
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        """Render seconds as [HH:]MM:SS<marker>mmm."""
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
        self,
        segments: List[Segment],
        speech_chunks: List[dict],
        sampling_rate: Optional[int] = None,
    ) -> List[Segment]:
        """Map segment times from VAD-concatenated audio back to the original timeline (mutates segments in place)."""
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment.start = ts_map.get_original_time(segment.start)
            segment.end = ts_map.get_original_time(segment.end)

        return segments
265
+
modules/whisper/__init__.py ADDED
File without changes
modules/whisper/base_transcription_pipeline.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import whisper
3
+ import ctranslate2
4
+ import gradio as gr
5
+ import torchaudio
6
+ from abc import ABC, abstractmethod
7
+ from typing import BinaryIO, Union, Tuple, List
8
+ import numpy as np
9
+ from datetime import datetime
10
+ from faster_whisper.vad import VadOptions
11
+
12
+ from modules.uvr.music_separator import MusicSeparator
13
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
14
+ UVR_MODELS_DIR)
15
+ from modules.utils.constants import *
16
+ from modules.utils.subtitle_manager import *
17
+ from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
+ from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
19
+ from modules.whisper.data_classes import *
20
+ from modules.diarize.diarizer import Diarizer
21
+ from modules.vad.silero_vad import SileroVAD
22
+
23
+
24
+ class BaseTranscriptionPipeline(ABC):
25
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        """Set up shared pipeline state: output/model directories, the
        diarizer, VAD, BGM separator, and device/compute-type defaults.
        Subclasses provide the actual whisper backend via `update_model`."""
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)
        self.diarizer = Diarizer(
            model_dir=diarization_model_dir
        )
        self.vad = SileroVAD()
        self.music_separator = MusicSeparator(
            model_dir=uvr_model_dir,
            output_dir=os.path.join(output_dir, "UVR")
        )

        # Backend model is loaded lazily by the subclass's update_model().
        self.model = None
        self.current_model_size = None
        self.available_models = whisper.available_models()
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        # Only these sizes support whisper's translate task.
        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
        self.device = self.get_device()
        self.available_compute_types = self.get_available_compute_type()
        self.current_compute_type = self.get_compute_type()
52
+
53
    @abstractmethod
    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ):
        """Inference whisper model to transcribe.

        Implemented by each backend subclass; must return
        (segments, elapsed_time) as consumed by `run()`."""
        pass
61
+
62
    @abstractmethod
    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """Initialize whisper model.

        Implemented by each backend subclass; loads (or reloads) the model
        for the given size and compute type."""
        pass
70
+
71
    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            progress: gr.Progress = gr.Progress(),
            file_format: str = "SRT",
            add_timestamp: bool = True,
            *pipeline_params,
            ) -> Tuple[List[Segment], float]:
        """
        Run transcription with conditional pre-processing and post-processing.
        The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
        The diarization will be performed in post-processing, if enabled.
        Due to the integration with gradio, the parameters have to be specified with a `*` wildcard.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        file_format: str
            Subtitle file format between ["SRT", "WebVTT", "txt", "lrc"]
        add_timestamp: bool
            Whether to add a timestamp at the end of the filename.
        *pipeline_params: tuple
            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class.
            This must be provided as a List with * wildcard because of the integration with gradio.
            See more info at : https://github.com/gradio-app/gradio/issues/2471

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        params = TranscriptionPipelineParams.from_list(list(pipeline_params))
        params = self.validate_gradio_values(params)
        bgm_params, vad_params, whisper_params, diarization_params = params.bgm_separation, params.vad, params.whisper, params.diarization

        # Pre-processing stage 1: optionally strip background music (UVR),
        # keeping only the vocals track for transcription.
        if bgm_params.is_separate_bgm:
            music, audio, _ = self.music_separator.separate(
                audio=audio,
                model_name=bgm_params.model_size,
                device=bgm_params.device,
                segment_size=bgm_params.segment_size,
                save_file=bgm_params.save_file,
                progress=progress
            )

            # Downmix multi-channel vocals to mono before resampling.
            if audio.ndim >= 2:
                audio = audio.mean(axis=1)
            if self.music_separator.audio_info is None:
                origin_sample_rate = 16000
            else:
                origin_sample_rate = self.music_separator.audio_info.sample_rate
            audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)

            if bgm_params.enable_offload:
                self.music_separator.offload()

        # Pre-processing stage 2: optionally drop non-speech regions with VAD.
        if vad_params.vad_filter:
            vad_options = VadOptions(
                threshold=vad_params.threshold,
                min_speech_duration_ms=vad_params.min_speech_duration_ms,
                max_speech_duration_s=vad_params.max_speech_duration_s,
                min_silence_duration_ms=vad_params.min_silence_duration_ms,
                speech_pad_ms=vad_params.speech_pad_ms
            )

            vad_processed, speech_chunks = self.vad.run(
                audio=audio,
                vad_parameters=vad_options,
                progress=progress
            )

            if vad_processed.size > 0:
                audio = vad_processed
            else:
                # VAD found no speech at all: fall back to the raw audio and
                # disable the later timestamp restoration.
                vad_params.vad_filter = False

        result, elapsed_time = self.transcribe(
            audio,
            progress,
            *whisper_params.to_list()
        )

        # Post-processing: map timestamps back to the original timeline if
        # the audio was VAD-trimmed before transcription.
        if vad_params.vad_filter:
            result = self.vad.restore_speech_timestamps(
                segments=result,
                speech_chunks=speech_chunks,
            )

        if diarization_params.is_diarize:
            result, elapsed_time_diarization = self.diarizer.run(
                audio=audio,
                use_auth_token=diarization_params.hf_token,
                transcribed_result=result,
                device=diarization_params.device
            )
            elapsed_time += elapsed_time_diarization

        # Persist the last-used parameters so the UI restores them next launch.
        self.cache_parameters(
            params=params,
            file_format=file_format,
            add_timestamp=add_timestamp
        )
        return result, elapsed_time
178
+
179
    def transcribe_file(self,
                        files: Optional[List] = None,
                        input_folder_path: Optional[str] = None,
                        file_format: str = "SRT",
                        add_timestamp: bool = True,
                        progress=gr.Progress(),
                        *pipeline_params,
                        ) -> Tuple[str, List]:
        """
        Write subtitle file from Files

        Parameters
        ----------
        files: list
            List of files to transcribe from gr.Files()
        input_folder_path: str
            Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
            this will be used instead.
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            # Normalize the various gradio input shapes to a list of paths.
            if input_folder_path:
                files = get_media_files(input_folder_path)
            if isinstance(files, str):
                files = [files]
            if files and isinstance(files[0], gr.utils.NamedString):
                files = [file.name for file in files]

            # NOTE(review): keyed by basename without extension — two inputs
            # with the same stem would overwrite each other here.
            files_info = {}
            for file in files:
                transcribed_segments, time_for_task = self.run(
                    file,
                    progress,
                    file_format,
                    add_timestamp,
                    *pipeline_params,
                )

                file_name, file_ext = os.path.splitext(os.path.basename(file))
                subtitle, file_path = generate_file(
                    output_dir=self.output_dir,
                    output_file_name=file_name,
                    output_format=file_format,
                    result=transcribed_segments,
                    add_timestamp=add_timestamp,
                    **writer_options
                )
                files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}

            # Concatenate all per-file subtitles into one display string.
            total_result = ''
            total_time = 0
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
                total_time += info["time_for_task"]

            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
            result_file_path = [info['path'] for info in files_info.values()]

            return result_str, result_file_path

        except Exception as e:
            print(f"Error transcribing file: {e}")
            raise
        finally:
            self.release_cuda_memory()
265
+
266
    def transcribe_mic(self,
                       mic_audio: str,
                       file_format: str = "SRT",
                       add_timestamp: bool = True,
                       progress=gr.Progress(),
                       *pipeline_params,
                       ) -> Tuple[str, str]:
        """
        Write subtitle file from microphone

        Parameters
        ----------
        mic_audio: str
            Audio file path from gr.Microphone()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            progress(0, desc="Loading Audio..")
            transcribed_segments, time_for_task = self.run(
                mic_audio,
                progress,
                file_format,
                add_timestamp,
                *pipeline_params,
            )
            progress(1, desc="Completed!")

            # Mic recordings have no meaningful filename; use a fixed stem.
            file_name = "Mic"
            subtitle, file_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_format,
                result=transcribed_segments,
                add_timestamp=add_timestamp,
                **writer_options
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
            return result_str, file_path
        except Exception as e:
            print(f"Error transcribing mic: {e}")
            raise
        finally:
            self.release_cuda_memory()
329
+
330
    def transcribe_youtube(self,
                           youtube_link: str,
                           file_format: str = "SRT",
                           add_timestamp: bool = True,
                           progress=gr.Progress(),
                           *pipeline_params,
                           ) -> Tuple[str, str]:
        """
        Write subtitle file from Youtube

        Parameters
        ----------
        youtube_link: str
            URL of the Youtube video to transcribe from gr.Textbox()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            progress(0, desc="Loading Audio from Youtube..")
            # Downloads the audio stream to a temporary local file.
            yt = get_ytdata(youtube_link)
            audio = get_ytaudio(yt)

            transcribed_segments, time_for_task = self.run(
                audio,
                progress,
                file_format,
                add_timestamp,
                *pipeline_params,
            )

            progress(1, desc="Completed!")

            # Derive a filesystem-safe name from the video title.
            file_name = safe_filename(yt.title)
            subtitle, file_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_format,
                result=transcribed_segments,
                add_timestamp=add_timestamp,
                **writer_options
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"

            # Clean up the downloaded temporary audio on success.
            if os.path.exists(audio):
                os.remove(audio)

            return result_str, file_path

        except Exception as e:
            print(f"Error transcribing youtube: {e}")
            raise
        finally:
            self.release_cuda_memory()
402
+
403
+ def get_compute_type(self):
404
+ if "float16" in self.available_compute_types:
405
+ return "float16"
406
+ if "float32" in self.available_compute_types:
407
+ return "float32"
408
+ else:
409
+ return self.available_compute_types[0]
410
+
411
+ def get_available_compute_type(self):
412
+ if self.device == "cuda":
413
+ return list(ctranslate2.get_supported_compute_types("cuda"))
414
+ else:
415
+ return list(ctranslate2.get_supported_compute_types("cpu"))
416
+
417
@staticmethod
def format_time(elapsed_time: float) -> str:
    """
    Get {hours} {minutes} {seconds} time format string

    Parameters
    ----------
    elapsed_time: float
        Elapsed time for transcription, in seconds

    Returns
    ----------
    Time format string such as "1 hours 2 minutes 3 seconds".
    Zero-valued hour/minute parts are omitted.
    """
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)
    # divmod on a float elapsed_time yields float quotients, which would
    # render as "1.0 hours"; cast to int for clean display.
    hours, minutes = int(hours), int(minutes)

    time_str = ""
    if hours:
        time_str += f"{hours} hours "
    if minutes:
        time_str += f"{minutes} minutes "
    seconds = round(seconds)
    time_str += f"{seconds} seconds"

    return time_str.strip()
443
+
444
@staticmethod
def get_device():
    """Select the best available torch device: "cuda" > "mps" > "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        # Device `SparseMPS` is not supported for now.
        # See : https://github.com/pytorch/pytorch/issues/87886
        return "mps" if BaseTranscriptionPipeline.is_sparse_api_supported() else "cpu"
    return "cpu"
455
+
456
@staticmethod
def is_sparse_api_supported():
    """Return True if torch can build a sparse COO tensor on the MPS backend."""
    if not torch.backends.mps.is_available():
        return False

    try:
        # Probe the sparse API; unsupported builds raise RuntimeError.
        torch.sparse_coo_tensor(
            indices=torch.tensor([[0, 1], [2, 3]]),
            values=torch.tensor([1, 2]),
            size=(4, 4),
            device=torch.device("mps"),
        )
        return True
    except RuntimeError:
        return False
472
+
473
@staticmethod
def release_cuda_memory():
    """Release memory"""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
479
+
480
@staticmethod
def remove_input_files(file_paths: List[str]):
    """Remove gradio cached files"""
    # None / empty list means nothing to clean up.
    for path in (file_paths or []):
        if path and os.path.exists(path):
            os.remove(path)
489
+
490
@staticmethod
def validate_gradio_values(params: TranscriptionPipelineParams):
    """
    Validate gradio specific values that can't be displayed as None in the UI.
    Related issue : https://github.com/gradio-app/gradio/issues/8723
    """
    wp = params.whisper

    if wp.lang == AUTOMATIC_DETECTION:
        wp.lang = None
    elif wp.lang is not None:
        # Map the display name (e.g. "english") back to its whisper code ("en").
        name_to_code = {name: code for code, name in whisper.tokenizer.LANGUAGES.items()}
        wp.lang = name_to_code[wp.lang]

    # Text fields: the UI shows a sentinel string instead of None.
    for attr in ("initial_prompt", "prefix", "hotwords"):
        if getattr(wp, attr) == GRADIO_NONE_STR:
            setattr(wp, attr, None)

    # Numeric fields: the UI sentinel minimum means "unset".
    for attr in ("max_new_tokens", "hallucination_silence_threshold",
                 "language_detection_threshold"):
        if getattr(wp, attr) == GRADIO_NONE_NUMBER_MIN:
            setattr(wp, attr, None)

    # The UI sentinel maximum stands in for an unbounded duration.
    if params.vad.max_speech_duration_s == GRADIO_NONE_NUMBER_MAX:
        params.vad.max_speech_duration_s = float('inf')
    return params
519
+
520
@staticmethod
def cache_parameters(
    params: TranscriptionPipelineParams,
    file_format: str = "SRT",
    add_timestamp: bool = True
):
    """Cache parameters to the yaml file.

    Merges the current pipeline parameters over the previously cached values
    and converts UI-only sentinel values back into yaml-serializable form
    before writing DEFAULT_PARAMETERS_CONFIG_PATH.

    Parameters
    ----------
    params: TranscriptionPipelineParams
        Parameters to persist.
    file_format: str
        Selected subtitle output format (e.g. "SRT"), stored alongside.
    add_timestamp: bool
        Whether timestamps are appended to output file names, stored alongside.
    """
    cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
    param_to_cache = params.to_dict()

    # Newly supplied values take precedence over previously cached ones.
    cached_yaml = {**cached_params, **param_to_cache}
    cached_yaml["whisper"]["add_timestamp"] = add_timestamp
    cached_yaml["whisper"]["file_format"] = file_format

    # suppress_tokens is stored as its string repr (e.g. "[-1]") so the
    # yaml stays human-editable; the field validator parses it back.
    supress_token = cached_yaml["whisper"].get("suppress_tokens", None)
    if supress_token and isinstance(supress_token, list):
        cached_yaml["whisper"]["suppress_tokens"] = str(supress_token)

    # lang is cached as its display name; None means automatic detection.
    if cached_yaml["whisper"].get("lang", None) is None:
        cached_yaml["whisper"]["lang"] = AUTOMATIC_DETECTION.unwrap()
    else:
        language_dict = whisper.tokenizer.LANGUAGES
        cached_yaml["whisper"]["lang"] = language_dict[cached_yaml["whisper"]["lang"]]

    # float('inf') is not yaml-friendly; store the UI sentinel instead.
    if cached_yaml["vad"].get("max_speech_duration_s", float('inf')) == float('inf'):
        cached_yaml["vad"]["max_speech_duration_s"] = GRADIO_NONE_NUMBER_MAX

    if cached_yaml is not None and cached_yaml:
        save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
549
+
550
@staticmethod
def resample_audio(audio: Union[str, np.ndarray],
                   new_sample_rate: int = 16000,
                   original_sample_rate: Optional[int] = None,) -> np.ndarray:
    """Resamples audio to 16k sample rate, standard on Whisper model"""
    if isinstance(audio, str):
        # File path: torchaudio reports the source sample rate itself.
        waveform, original_sample_rate = torchaudio.load(audio)
    elif original_sample_rate is None:
        raise ValueError("original_sample_rate must be provided when audio is numpy array.")
    else:
        waveform = torch.from_numpy(audio)

    resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate,
                                               new_freq=new_sample_rate)
    return resampler(waveform).numpy()
modules/whisper/data_classes.py ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faster_whisper.transcribe
2
+ import gradio as gr
3
+ import torch
4
+ from typing import Optional, Dict, List, Union, NamedTuple
5
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
6
+ from gradio_i18n import Translate, gettext as _
7
+ from enum import Enum
8
+ from copy import deepcopy
9
+
10
+ import yaml
11
+
12
+ from modules.utils.constants import *
13
+
14
+
15
class WhisperImpl(Enum):
    """Whisper backend implementations selectable by the app (UI/CLI)."""
    WHISPER = "whisper"
    FASTER_WHISPER = "faster-whisper"
    INSANELY_FAST_WHISPER = "insanely_fast_whisper"
19
+
20
+
21
class Segment(BaseModel):
    """One transcribed audio segment with timing, decoding stats and words.

    All fields default to None so partially-populated segments (e.g. from
    backends that don't report decoding statistics) are still valid.
    """
    id: Optional[int] = Field(default=None, description="Incremental id for the segment")
    seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
    text: Optional[str] = Field(default=None, description="Transcription text of the segment")
    start: Optional[float] = Field(default=None, description="Start time of the segment")
    end: Optional[float] = Field(default=None, description="End time of the segment")
    tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
    temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
    avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
    compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
    no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
    words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")

    @classmethod
    def from_faster_whisper(cls,
                            seg: faster_whisper.transcribe.Segment):
        """Convert a faster-whisper Segment (and its words, if any) to this model."""
        # Word-level entries only exist when word timestamps were requested.
        if seg.words is not None:
            words = [
                Word(
                    start=w.start,
                    end=w.end,
                    word=w.word,
                    probability=w.probability
                ) for w in seg.words
            ]
        else:
            words = None

        return cls(
            id=seg.id,
            seek=seg.seek,
            text=seg.text,
            start=seg.start,
            end=seg.end,
            tokens=seg.tokens,
            temperature=seg.temperature,
            avg_logprob=seg.avg_logprob,
            compression_ratio=seg.compression_ratio,
            no_speech_prob=seg.no_speech_prob,
            words=words
        )
62
+
63
+
64
class Word(BaseModel):
    """A single word with timing and confidence, attached to a Segment."""
    start: Optional[float] = Field(default=None, description="Start time of the word")
    # Fixed copy-paste typo: description previously said "Start time of the word".
    end: Optional[float] = Field(default=None, description="End time of the word")
    word: Optional[str] = Field(default=None, description="Word text")
    probability: Optional[float] = Field(default=None, description="Probability of the word")
69
+
70
+
71
class BaseParams(BaseModel):
    """Common list/dict (de)serialization helpers for all parameter models."""
    model_config = ConfigDict(protected_namespaces=())

    def to_dict(self) -> Dict:
        """Return the parameters as a plain dict."""
        return self.model_dump()

    def to_list(self) -> List:
        """Return the parameter values in field-declaration order."""
        return [value for value in self.model_dump().values()]

    @classmethod
    def from_list(cls, data_list: List) -> 'BaseParams':
        """Rebuild the model from values listed in field-declaration order."""
        return cls(**dict(zip(cls.model_fields.keys(), data_list)))
84
+
85
+
86
class VadParams(BaseParams):
    """Voice Activity Detection parameters"""
    vad_filter: bool = Field(default=False, description="Enable voice activity detection to filter out non-speech parts")
    threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Speech threshold for Silero VAD. Probabilities above this value are considered speech"
    )
    min_speech_duration_ms: int = Field(
        default=250,
        ge=0,
        description="Final speech chunks shorter than this are discarded"
    )
    max_speech_duration_s: float = Field(
        default=float("inf"),
        gt=0,
        description="Maximum duration of speech chunks in seconds"
    )
    min_silence_duration_ms: int = Field(
        default=2000,
        ge=0,
        description="Minimum silence duration between speech chunks"
    )
    speech_pad_ms: int = Field(
        default=400,
        ge=0,
        description="Padding added to each side of speech chunks"
    )

    @classmethod
    def to_gradio_inputs(cls, defaults: Optional[Dict] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the VAD settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when a key is
            missing or when defaults is None.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Silero VAD Filter"),
                value=defaults.get("vad_filter", cls.__fields__["vad_filter"].default),
                interactive=True,
                info=_("Enable this to transcribe only detected voice")
            ),
            gr.Slider(
                minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
                value=defaults.get("threshold", cls.__fields__["threshold"].default),
                info="Lower it to be more sensitive to small sounds."
            ),
            gr.Number(
                label="Minimum Speech Duration (ms)", precision=0,
                value=defaults.get("min_speech_duration_ms", cls.__fields__["min_speech_duration_ms"].default),
                info="Final speech chunks shorter than this time are thrown out"
            ),
            gr.Number(
                label="Maximum Speech Duration (s)",
                value=defaults.get("max_speech_duration_s", GRADIO_NONE_NUMBER_MAX),
                info="Maximum duration of speech chunks in \"seconds\"."
            ),
            gr.Number(
                label="Minimum Silence Duration (ms)", precision=0,
                value=defaults.get("min_silence_duration_ms", cls.__fields__["min_silence_duration_ms"].default),
                info="In the end of each speech chunk wait for this time before separating it"
            ),
            gr.Number(
                label="Speech Padding (ms)", precision=0,
                value=defaults.get("speech_pad_ms", cls.__fields__["speech_pad_ms"].default),
                info="Final speech chunks are padded by this time each side"
            )
        ]
151
+
152
+
153
class DiarizationParams(BaseParams):
    """Speaker diarization parameters"""
    is_diarize: bool = Field(default=False, description="Enable speaker diarization")
    device: str = Field(default="cuda", description="Device to run Diarization model.")
    hf_token: str = Field(
        default="",
        description="Hugging Face token for downloading diarization models"
    )

    @classmethod
    def to_gradio_inputs(cls,
                         defaults: Optional[Dict] = None,
                         available_devices: Optional[List] = None,
                         device: Optional[str] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the diarization settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when missing.
        available_devices: Optional[List]
            Device choices to offer; falls back to ["cpu", "cuda"].
        device: Optional[str]
            Device preselected when no cached value exists.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Diarization"),
                value=defaults.get("is_diarize", cls.__fields__["is_diarize"].default),
            ),
            gr.Dropdown(
                label=_("Device"),
                choices=["cpu", "cuda"] if available_devices is None else available_devices,
                value=defaults.get("device", device),
            ),
            gr.Textbox(
                label=_("HuggingFace Token"),
                value=defaults.get("hf_token", cls.__fields__["hf_token"].default),
                info=_("This is only needed the first time you download the model")
            ),
        ]
183
+
184
+
185
class BGMSeparationParams(BaseParams):
    """Background music separation parameters"""
    is_separate_bgm: bool = Field(default=False, description="Enable background music separation")
    model_size: str = Field(
        default="UVR-MDX-NET-Inst_HQ_4",
        description="UVR model size"
    )
    device: str = Field(default="cuda", description="Device to run UVR model.")
    segment_size: int = Field(
        default=256,
        gt=0,
        description="Segment size for UVR model"
    )
    save_file: bool = Field(
        default=False,
        description="Whether to save separated audio files"
    )
    enable_offload: bool = Field(
        default=True,
        description="Offload UVR model after transcription"
    )

    # NOTE: named to_gradio_input (singular), unlike the sibling classes'
    # to_gradio_inputs — kept for backward compatibility with callers.
    @classmethod
    def to_gradio_input(cls,
                        defaults: Optional[Dict] = None,
                        available_devices: Optional[List] = None,
                        device: Optional[str] = None,
                        available_models: Optional[List] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the BGM separation settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when missing.
        available_devices: Optional[List]
            Device choices; falls back to ["cpu", "cuda"].
        device: Optional[str]
            Device preselected when no cached value exists.
        available_models: Optional[List]
            UVR model choices; falls back to the two built-in model names.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Background Music Remover Filter"),
                value=defaults.get("is_separate_bgm", cls.__fields__["is_separate_bgm"].default),
                interactive=True,
                info=_("Enabling this will remove background music")
            ),
            gr.Dropdown(
                label=_("Model"),
                choices=["UVR-MDX-NET-Inst_HQ_4",
                         "UVR-MDX-NET-Inst_3"] if available_models is None else available_models,
                value=defaults.get("model_size", cls.__fields__["model_size"].default),
            ),
            gr.Dropdown(
                label=_("Device"),
                choices=["cpu", "cuda"] if available_devices is None else available_devices,
                value=defaults.get("device", device),
            ),
            gr.Number(
                label="Segment Size",
                value=defaults.get("segment_size", cls.__fields__["segment_size"].default),
                precision=0,
                info="Segment size for UVR model"
            ),
            gr.Checkbox(
                label=_("Save separated files to output"),
                value=defaults.get("save_file", cls.__fields__["save_file"].default),
            ),
            gr.Checkbox(
                label=_("Offload sub model after removing background music"),
                value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default),
            )
        ]
246
+
247
+
248
class WhisperParams(BaseParams):
    """Whisper parameters"""
    model_size: str = Field(default="large-v2", description="Whisper model size")
    lang: Optional[str] = Field(default=None, description="Source language of the file to transcribe")
    is_translate: bool = Field(default=False, description="Translate speech to English end-to-end")
    beam_size: int = Field(default=5, ge=1, description="Beam size for decoding")
    log_prob_threshold: float = Field(
        default=-1.0,
        description="Threshold for average log probability of sampled tokens"
    )
    no_speech_threshold: float = Field(
        default=0.6,
        ge=0.0,
        le=1.0,
        description="Threshold for detecting silence"
    )
    compute_type: str = Field(default="float16", description="Computation type for transcription")
    best_of: int = Field(default=5, ge=1, description="Number of candidates when sampling")
    patience: float = Field(default=1.0, gt=0, description="Beam search patience factor")
    condition_on_previous_text: bool = Field(
        default=True,
        description="Use previous output as prompt for next window"
    )
    prompt_reset_on_temperature: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Temperature threshold for resetting prompt"
    )
    initial_prompt: Optional[str] = Field(default=None, description="Initial prompt for first window")
    temperature: float = Field(
        default=0.0,
        ge=0.0,
        description="Temperature for sampling"
    )
    compression_ratio_threshold: float = Field(
        default=2.4,
        gt=0,
        description="Threshold for gzip compression ratio"
    )
    length_penalty: float = Field(default=1.0, gt=0, description="Exponential length penalty")
    repetition_penalty: float = Field(default=1.0, gt=0, description="Penalty for repeated tokens")
    no_repeat_ngram_size: int = Field(default=0, ge=0, description="Size of n-grams to prevent repetition")
    prefix: Optional[str] = Field(default=None, description="Prefix text for first window")
    suppress_blank: bool = Field(
        default=True,
        description="Suppress blank outputs at start of sampling"
    )
    suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
    max_initial_timestamp: float = Field(
        default=1.0,
        ge=0.0,
        description="Maximum initial timestamp"
    )
    word_timestamps: bool = Field(default=False, description="Extract word-level timestamps")
    prepend_punctuations: Optional[str] = Field(
        default="\"'“¿([{-",
        description="Punctuations to merge with next word"
    )
    append_punctuations: Optional[str] = Field(
        default="\"'.。,,!!??::”)]}、",
        description="Punctuations to merge with previous word"
    )
    max_new_tokens: Optional[int] = Field(default=None, description="Maximum number of new tokens per chunk")
    chunk_length: Optional[int] = Field(default=30, description="Length of audio segments in seconds")
    hallucination_silence_threshold: Optional[float] = Field(
        default=None,
        description="Threshold for skipping silent periods in hallucination detection"
    )
    hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
    language_detection_threshold: Optional[float] = Field(
        default=None,
        description="Threshold for language detection probability"
    )
    language_detection_segments: int = Field(
        default=1,
        gt=0,
        description="Number of segments for language detection"
    )
    batch_size: int = Field(default=24, gt=0, description="Batch size for processing")

    @field_validator('lang')
    def validate_lang(cls, v):
        """Map the 'automatic detection' display value to None (auto-detect)."""
        from modules.utils.constants import AUTOMATIC_DETECTION
        return None if v == AUTOMATIC_DETECTION.unwrap() else v

    @field_validator('suppress_tokens')
    def validate_supress_tokens(cls, v):
        """Parse a stringified token list (e.g. "[-1]") back into a list."""
        import ast
        try:
            if isinstance(v, str):
                suppress_tokens = ast.literal_eval(v)
                if not isinstance(suppress_tokens, list):
                    raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
                return suppress_tokens
            if isinstance(v, list):
                return v
        except Exception as e:
            raise ValueError(f"Invalid Suppress Tokens. The value must be type of List[int]: {e}")

    @classmethod
    def to_gradio_inputs(cls,
                         defaults: Optional[Dict] = None,
                         only_advanced: Optional[bool] = True,
                         whisper_type: Optional[str] = None,
                         available_models: Optional[List] = None,
                         available_langs: Optional[List] = None,
                         available_compute_types: Optional[List] = None,
                         compute_type: Optional[str] = None):
        """Build the gradio input components for whisper settings.

        Backend-specific components (faster-whisper / insanely-fast-whisper)
        are always created but hidden when they don't apply, so the flat
        parameter list passed through gradio keeps a fixed length and order.
        """
        whisper_type = WhisperImpl.FASTER_WHISPER.value if whisper_type is None else whisper_type.strip().lower()
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults

        inputs = []
        if not only_advanced:
            inputs += [
                gr.Dropdown(
                    label=_("Model"),
                    choices=available_models,
                    value=defaults.get("model_size", cls.__fields__["model_size"].default),
                ),
                gr.Dropdown(
                    label=_("Language"),
                    choices=available_langs,
                    value=defaults.get("lang", AUTOMATIC_DETECTION),
                ),
                gr.Checkbox(
                    label=_("Translate to English?"),
                    value=defaults.get("is_translate", cls.__fields__["is_translate"].default),
                ),
            ]

        inputs += [
            gr.Number(
                label="Beam Size",
                value=defaults.get("beam_size", cls.__fields__["beam_size"].default),
                precision=0,
                info="Beam size for decoding"
            ),
            gr.Number(
                label="Log Probability Threshold",
                value=defaults.get("log_prob_threshold", cls.__fields__["log_prob_threshold"].default),
                info="Threshold for average log probability of sampled tokens"
            ),
            gr.Number(
                label="No Speech Threshold",
                value=defaults.get("no_speech_threshold", cls.__fields__["no_speech_threshold"].default),
                info="Threshold for detecting silence"
            ),
            gr.Dropdown(
                label="Compute Type",
                choices=["float16", "int8", "int16"] if available_compute_types is None else available_compute_types,
                value=defaults.get("compute_type", compute_type),
                info="Computation type for transcription"
            ),
            gr.Number(
                label="Best Of",
                value=defaults.get("best_of", cls.__fields__["best_of"].default),
                precision=0,
                info="Number of candidates when sampling"
            ),
            gr.Number(
                label="Patience",
                value=defaults.get("patience", cls.__fields__["patience"].default),
                info="Beam search patience factor"
            ),
            gr.Checkbox(
                label="Condition On Previous Text",
                value=defaults.get("condition_on_previous_text", cls.__fields__["condition_on_previous_text"].default),
                info="Use previous output as prompt for next window"
            ),
            gr.Slider(
                label="Prompt Reset On Temperature",
                value=defaults.get("prompt_reset_on_temperature",
                                   cls.__fields__["prompt_reset_on_temperature"].default),
                minimum=0,
                maximum=1,
                step=0.01,
                info="Temperature threshold for resetting prompt"
            ),
            gr.Textbox(
                label="Initial Prompt",
                value=defaults.get("initial_prompt", GRADIO_NONE_STR),
                info="Initial prompt for first window"
            ),
            gr.Slider(
                label="Temperature",
                value=defaults.get("temperature", cls.__fields__["temperature"].default),
                minimum=0.0,
                step=0.01,
                maximum=1.0,
                info="Temperature for sampling"
            ),
            gr.Number(
                label="Compression Ratio Threshold",
                value=defaults.get("compression_ratio_threshold",
                                   cls.__fields__["compression_ratio_threshold"].default),
                info="Threshold for gzip compression ratio"
            )
        ]

        faster_whisper_inputs = [
            gr.Number(
                label="Length Penalty",
                value=defaults.get("length_penalty", cls.__fields__["length_penalty"].default),
                info="Exponential length penalty",
            ),
            gr.Number(
                label="Repetition Penalty",
                value=defaults.get("repetition_penalty", cls.__fields__["repetition_penalty"].default),
                info="Penalty for repeated tokens"
            ),
            gr.Number(
                label="No Repeat N-gram Size",
                value=defaults.get("no_repeat_ngram_size", cls.__fields__["no_repeat_ngram_size"].default),
                precision=0,
                info="Size of n-grams to prevent repetition"
            ),
            gr.Textbox(
                label="Prefix",
                value=defaults.get("prefix", GRADIO_NONE_STR),
                info="Prefix text for first window"
            ),
            gr.Checkbox(
                label="Suppress Blank",
                value=defaults.get("suppress_blank", cls.__fields__["suppress_blank"].default),
                info="Suppress blank outputs at start of sampling"
            ),
            gr.Textbox(
                label="Suppress Tokens",
                value=defaults.get("suppress_tokens", "[-1]"),
                info="Token IDs to suppress"
            ),
            gr.Number(
                label="Max Initial Timestamp",
                value=defaults.get("max_initial_timestamp", cls.__fields__["max_initial_timestamp"].default),
                info="Maximum initial timestamp"
            ),
            gr.Checkbox(
                label="Word Timestamps",
                value=defaults.get("word_timestamps", cls.__fields__["word_timestamps"].default),
                info="Extract word-level timestamps"
            ),
            gr.Textbox(
                label="Prepend Punctuations",
                value=defaults.get("prepend_punctuations", cls.__fields__["prepend_punctuations"].default),
                info="Punctuations to merge with next word"
            ),
            gr.Textbox(
                label="Append Punctuations",
                value=defaults.get("append_punctuations", cls.__fields__["append_punctuations"].default),
                info="Punctuations to merge with previous word"
            ),
            gr.Number(
                label="Max New Tokens",
                value=defaults.get("max_new_tokens", GRADIO_NONE_NUMBER_MIN),
                precision=0,
                info="Maximum number of new tokens per chunk"
            ),
            gr.Number(
                label="Chunk Length (s)",
                value=defaults.get("chunk_length", cls.__fields__["chunk_length"].default),
                precision=0,
                info="Length of audio segments in seconds"
            ),
            gr.Number(
                label="Hallucination Silence Threshold (sec)",
                value=defaults.get("hallucination_silence_threshold",
                                   GRADIO_NONE_NUMBER_MIN),
                info="Threshold for skipping silent periods in hallucination detection"
            ),
            gr.Textbox(
                label="Hotwords",
                value=defaults.get("hotwords", cls.__fields__["hotwords"].default),
                info="Hotwords/hint phrases for the model"
            ),
            gr.Number(
                label="Language Detection Threshold",
                value=defaults.get("language_detection_threshold",
                                   GRADIO_NONE_NUMBER_MIN),
                info="Threshold for language detection probability"
            ),
            gr.Number(
                label="Language Detection Segments",
                value=defaults.get("language_detection_segments",
                                   cls.__fields__["language_detection_segments"].default),
                precision=0,
                info="Number of segments for language detection"
            )
        ]

        insanely_fast_whisper_inputs = [
            gr.Number(
                label="Batch Size",
                value=defaults.get("batch_size", cls.__fields__["batch_size"].default),
                precision=0,
                info="Batch size for processing"
            )
        ]

        # Hide (not drop) inapplicable backend inputs so the parameter list
        # keeps a fixed length and order across backends.
        if whisper_type != WhisperImpl.FASTER_WHISPER.value:
            for input_component in faster_whisper_inputs:
                input_component.visible = False

        if whisper_type != WhisperImpl.INSANELY_FAST_WHISPER.value:
            for input_component in insanely_fast_whisper_inputs:
                input_component.visible = False

        inputs += faster_whisper_inputs + insanely_fast_whisper_inputs

        return inputs
557
+
558
+
559
class TranscriptionPipelineParams(BaseModel):
    """Transcription pipeline parameters"""
    whisper: WhisperParams = Field(default_factory=WhisperParams)
    vad: VadParams = Field(default_factory=VadParams)
    diarization: DiarizationParams = Field(default_factory=DiarizationParams)
    bgm_separation: BGMSeparationParams = Field(default_factory=BGMSeparationParams)

    def to_dict(self) -> Dict:
        """Return a nested dict with one sub-dict per parameter group."""
        data = {
            "whisper": self.whisper.to_dict(),
            "vad": self.vad.to_dict(),
            "diarization": self.diarization.to_dict(),
            "bgm_separation": self.bgm_separation.to_dict()
        }
        return data

    def to_list(self) -> List:
        """
        Convert data class to the list because I have to pass the parameters as a list in the gradio.
        Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        The concatenation order (whisper, vad, diarization, bgm_separation)
        must match the slicing order in from_list below.
        """
        whisper_list = self.whisper.to_list()
        vad_list = self.vad.to_list()
        diarization_list = self.diarization.to_list()
        bgm_sep_list = self.bgm_separation.to_list()
        return whisper_list + vad_list + diarization_list + bgm_sep_list

    @staticmethod
    def from_list(pipeline_list: List) -> 'TranscriptionPipelineParams':
        """Convert list to the data class again to use it in a function.

        Slices the flat gradio value list back into per-group chunks using
        each model's annotated-field count; chunk order must mirror to_list.
        """
        data_list = deepcopy(pipeline_list)

        # len(__annotations__) == number of declared fields for each model.
        whisper_list = data_list[0:len(WhisperParams.__annotations__)]
        data_list = data_list[len(WhisperParams.__annotations__):]

        vad_list = data_list[0:len(VadParams.__annotations__)]
        data_list = data_list[len(VadParams.__annotations__):]

        diarization_list = data_list[0:len(DiarizationParams.__annotations__)]
        data_list = data_list[len(DiarizationParams.__annotations__):]

        bgm_sep_list = data_list[0:len(BGMSeparationParams.__annotations__)]

        return TranscriptionPipelineParams(
            whisper=WhisperParams.from_list(whisper_list),
            vad=VadParams.from_list(vad_list),
            diarization=DiarizationParams.from_list(diarization_list),
            bgm_separation=BGMSeparationParams.from_list(bgm_sep_list)
        )
modules/whisper/faster_whisper_inference.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ import torch
5
+ from typing import BinaryIO, Union, Tuple, List
6
+ import faster_whisper
7
+ from faster_whisper.vad import VadOptions
8
+ import ast
9
+ import ctranslate2
10
+ import whisper
11
+ import gradio as gr
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
+
18
+
19
+ class FasterWhisperInference(BaseTranscriptionPipeline):
20
def __init__(self,
             model_dir: str = FASTER_WHISPER_MODELS_DIR,
             diarization_model_dir: str = DIARIZATION_MODELS_DIR,
             uvr_model_dir: str = UVR_MODELS_DIR,
             output_dir: str = OUTPUT_DIR,
             ):
    """Initialize the faster-whisper pipeline and discover local models.

    Parameters
    ----------
    model_dir: str
        Directory where faster-whisper model weights live / are downloaded.
    diarization_model_dir: str
        Directory for diarization model weights.
    uvr_model_dir: str
        Directory for UVR (background music separation) model weights.
    output_dir: str
        Directory where transcription outputs are written.
    """
    super().__init__(
        model_dir=model_dir,
        diarization_model_dir=diarization_model_dir,
        uvr_model_dir=uvr_model_dir,
        output_dir=output_dir
    )
    self.model_dir = model_dir
    os.makedirs(self.model_dir, exist_ok=True)

    # NOTE(review): assumed to map model-size name -> local path; confirm
    # against get_model_paths(). available_models is a live dict view.
    self.model_paths = self.get_model_paths()
    self.device = self.get_device()
    self.available_models = self.model_paths.keys()
38
+
39
def transcribe(self,
               audio: Union[str, BinaryIO, np.ndarray],
               progress: gr.Progress = gr.Progress(),
               *whisper_params,
               ) -> Tuple[List[Segment], float]:
    """
    transcribe method for faster-whisper.

    Parameters
    ----------
    audio: Union[str, BinaryIO, np.ndarray]
        Audio path or file binary or Audio numpy array
    progress: gr.Progress
        Indicator to show progress directly in gradio.
    *whisper_params: tuple
        Parameters related with whisper. This will be dealt with "WhisperParameters" data class

    Returns
    ----------
    segments_result: List[Segment]
        list of Segment that includes start, end timestamps and transcribed text
    elapsed_time: float
        elapsed time for transcription
    """
    start_time = time.time()

    # Flat gradio value list -> typed parameter object (order-sensitive).
    params = WhisperParams.from_list(list(whisper_params))

    # Reload only when the requested model or compute type changed.
    if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
        self.update_model(params.model_size, params.compute_type, progress)

    segments, info = self.model.transcribe(
        audio=audio,
        language=params.lang,
        # English-only models cannot run the translate task.
        task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
        beam_size=params.beam_size,
        log_prob_threshold=params.log_prob_threshold,
        no_speech_threshold=params.no_speech_threshold,
        best_of=params.best_of,
        patience=params.patience,
        temperature=params.temperature,
        initial_prompt=params.initial_prompt,
        compression_ratio_threshold=params.compression_ratio_threshold,
        length_penalty=params.length_penalty,
        repetition_penalty=params.repetition_penalty,
        no_repeat_ngram_size=params.no_repeat_ngram_size,
        prefix=params.prefix,
        suppress_blank=params.suppress_blank,
        suppress_tokens=params.suppress_tokens,
        max_initial_timestamp=params.max_initial_timestamp,
        word_timestamps=params.word_timestamps,
        prepend_punctuations=params.prepend_punctuations,
        append_punctuations=params.append_punctuations,
        max_new_tokens=params.max_new_tokens,
        chunk_length=params.chunk_length,
        hallucination_silence_threshold=params.hallucination_silence_threshold,
        hotwords=params.hotwords,
        language_detection_threshold=params.language_detection_threshold,
        language_detection_segments=params.language_detection_segments,
        prompt_reset_on_temperature=params.prompt_reset_on_temperature,
    )
    # NOTE(review): presumably model.transcribe returns lazily, so decoding
    # actually happens in the loop below — confirm against faster-whisper docs.
    progress(0, desc="Loading audio..")

    segments_result = []
    for segment in segments:
        # Per-segment progress based on how far into the audio we are.
        progress(segment.start / info.duration, desc="Transcribing..")
        segments_result.append(Segment.from_faster_whisper(segment))

    elapsed_time = time.time() - start_time
    return segments_result, elapsed_time
109
+
110
+ def update_model(self,
111
+ model_size: str,
112
+ compute_type: str,
113
+ progress: gr.Progress = gr.Progress()
114
+ ):
115
+ """
116
+ Update current model setting
117
+
118
+ Parameters
119
+ ----------
120
+ model_size: str
121
+ Size of whisper model
122
+ compute_type: str
123
+ Compute type for transcription.
124
+ see more info : https://opennmt.net/CTranslate2/quantization.html
125
+ progress: gr.Progress
126
+ Indicator to show progress directly in gradio.
127
+ """
128
+ progress(0, desc="Initializing Model..")
129
+ self.current_model_size = self.model_paths[model_size]
130
+ self.current_compute_type = compute_type
131
+ self.model = faster_whisper.WhisperModel(
132
+ device=self.device,
133
+ model_size_or_path=self.current_model_size,
134
+ download_root=self.model_dir,
135
+ compute_type=self.current_compute_type
136
+ )
137
+
138
+ def get_model_paths(self):
139
+ """
140
+ Get available models from models path including fine-tuned model.
141
+
142
+ Returns
143
+ ----------
144
+ Name list of models
145
+ """
146
+ model_paths = {model:model for model in faster_whisper.available_models()}
147
+ faster_whisper_prefix = "models--Systran--faster-whisper-"
148
+
149
+ existing_models = os.listdir(self.model_dir)
150
+ wrong_dirs = [".locks"]
151
+ existing_models = list(set(existing_models) - set(wrong_dirs))
152
+
153
+ for model_name in existing_models:
154
+ if faster_whisper_prefix in model_name:
155
+ model_name = model_name[len(faster_whisper_prefix):]
156
+
157
+ if model_name not in whisper.available_models():
158
+ model_paths[model_name] = os.path.join(self.model_dir, model_name)
159
+ return model_paths
160
+
161
+ @staticmethod
162
+ def get_device():
163
+ if torch.cuda.is_available():
164
+ return "cuda"
165
+ else:
166
+ return "auto"
167
+
168
+ @staticmethod
169
+ def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
170
+ try:
171
+ suppress_tokens = ast.literal_eval(suppress_tokens_str)
172
+ if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
173
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
174
+ return suppress_tokens
175
+ except Exception as e:
176
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
modules/whisper/insanely_fast_whisper_inference.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import torch
6
+ from transformers import pipeline
7
+ from transformers.utils import is_flash_attn_2_available
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import whisper
11
+ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
+
18
+
19
class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
    """
    Transcription pipeline implemented on top of insanely-fast-whisper
    (HuggingFace `transformers` ASR pipeline with batched inference).
    """
    def __init__(self,
                 model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        self.available_models = self.get_model_paths()

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[Segment], float]:
        """
        transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParams" data class

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParams.from_list(list(whisper_params))

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
        # Use a distinct name for the rich progress bar so it does not shadow
        # the gradio `progress` parameter above.
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as rich_progress:
            rich_progress.add_task("[yellow]Transcribing...", total=None)

            kwargs = {
                "no_speech_threshold": params.no_speech_threshold,
                "temperature": params.temperature,
                "compression_ratio_threshold": params.compression_ratio_threshold,
                "logprob_threshold": params.log_prob_threshold,
            }

            # English-only checkpoints ("*.en") reject language/task generation options.
            if not self.current_model_size.endswith(".en"):
                kwargs["language"] = params.lang
                kwargs["task"] = "translate" if params.is_translate else "transcribe"

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length,
                batch_size=params.batch_size,
                generate_kwargs=kwargs
            )

        segments_result = []
        for item in segments["chunks"]:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                # The final chunk can come back without an end timestamp.
                end = start
            segments_result.append(Segment(
                text=item["text"],
                start=start,
                end=end
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        # Download the checkpoint when the local directory is missing or empty.
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=model_path,
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    def get_model_paths(self):
        """
        Get available models from models path including fine-tuned model.

        Returns
        ----------
        Name list of models (stock openai/distil models first, then local ones),
        de-duplicated while preserving order.
        """
        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        default_models = openai_models + distil_models

        existing_models = os.listdir(self.model_dir)
        wrong_dirs = [".locks"]

        available_models = default_models + existing_models
        available_models = [model for model in available_models if model not in wrong_dirs]
        available_models = sorted(set(available_models), key=available_models.index)

        return available_models

    @staticmethod
    def download_model(
        model_size: str,
        download_root: str,
        progress: gr.Progress
    ):
        """Download every file of a whisper/distil-whisper checkpoint from the HF hub."""
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
modules/whisper/whisper_Inference.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import gradio as gr
3
+ import time
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import numpy as np
6
+ import torch
7
+ import os
8
+ from argparse import Namespace
9
+
10
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
12
+ from modules.whisper.data_classes import *
13
+
14
+
15
class WhisperInference(BaseTranscriptionPipeline):
    """
    Transcription pipeline implemented on top of the original openai/whisper package.
    """
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[Segment], float]:
        """
        transcribe method for openai/whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParams" data class

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParams.from_list(list(whisper_params))

        # Reload the model only when size / compute type actually changed.
        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        def progress_callback(progress_value):
            # Forward native whisper progress into the gradio indicator.
            progress(progress_value, desc="Transcribing..")

        result = self.model.transcribe(audio=audio,
                                       language=params.lang,
                                       verbose=False,
                                       beam_size=params.beam_size,
                                       logprob_threshold=params.log_prob_threshold,
                                       no_speech_threshold=params.no_speech_threshold,
                                       task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                                       fp16=params.compute_type == "float16",
                                       best_of=params.best_of,
                                       patience=params.patience,
                                       temperature=params.temperature,
                                       compression_ratio_threshold=params.compression_ratio_threshold,
                                       progress_callback=progress_callback,)["segments"]
        segments_result = []
        for segment in result:
            segments_result.append(Segment(
                start=segment["start"],
                end=segment["end"],
                text=segment["text"]
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = whisper.load_model(
            name=model_size,
            device=self.device,
            download_root=self.model_dir
        )
+ )
modules/whisper/whisper_factory.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import os
3
+
4
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
5
+ INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
6
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
+ from modules.whisper.whisper_Inference import WhisperInference
8
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
10
+ from modules.whisper.data_classes import *
11
+
12
+
13
class WhisperFactory:
    """Factory that instantiates the selected Whisper backend implementation."""

    @staticmethod
    def create_whisper_inference(
        whisper_type: str,
        whisper_model_dir: str = WHISPER_MODELS_DIR,
        faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
        insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
        diarization_model_dir: str = DIARIZATION_MODELS_DIR,
        uvr_model_dir: str = UVR_MODELS_DIR,
        output_dir: str = OUTPUT_DIR,
    ) -> "BaseTranscriptionPipeline":
        """
        Create a whisper inference class based on the provided whisper_type.

        Parameters
        ----------
        whisper_type : str
            The type of Whisper implementation to use. Supported values (case-insensitive):
            - "faster-whisper": https://github.com/SYSTRAN/faster-whisper
            - "whisper": https://github.com/openai/whisper
            - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
            Any unrecognized value falls back to faster-whisper.
        whisper_model_dir : str
            Directory path for the Whisper model.
        faster_whisper_model_dir : str
            Directory path for the Faster Whisper model.
        insanely_fast_whisper_model_dir : str
            Directory path for the Insanely Fast Whisper model.
        diarization_model_dir : str
            Directory path for the diarization model.
        uvr_model_dir : str
            Directory path for the UVR model.
        output_dir : str
            Directory path where output files will be saved.

        Returns
        -------
        BaseTranscriptionPipeline
            An instance of the appropriate whisper inference class based on the whisper_type.
        """
        # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

        whisper_type = whisper_type.strip().lower()

        if whisper_type == WhisperImpl.WHISPER.value:
            return WhisperInference(
                model_dir=whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        elif whisper_type == WhisperImpl.INSANELY_FAST_WHISPER.value:
            return InsanelyFastWhisperInference(
                model_dir=insanely_fast_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        else:
            # "faster-whisper" and any unknown value both resolve here:
            # faster-whisper is the explicit choice AND the default fallback.
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
notebook/whisper-webui.ipynb ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "---\n",
7
+ "\n",
8
+ "📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
9
+ "\n",
10
+ "🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
11
+ "\n",
12
+ "😎 **Support the Project**:\n",
13
+ "\n",
14
+ "If you find this project useful, please consider supporting it:\n",
15
+ "\n",
16
+ "<a href=\"https://ko-fi.com/jhj0517\" target=\"_blank\">\n",
17
+ " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
18
+ "</a>\n",
19
+ "\n",
20
+ "---"
21
+ ],
22
+ "metadata": {
23
+ "id": "doKhBBXIfS21"
24
+ }
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "source": [
29
+ "#@title #(Optional) Check GPU\n",
30
+ "#@markdown Some models may not function correctly on a CPU runtime.\n",
31
+ "\n",
32
+ "#@markdown so you should check your GPU setup before run.\n",
33
+ "!nvidia-smi"
34
+ ],
35
+ "metadata": {
36
+ "id": "23yZvUlagEsx",
37
+ "cellView": "form"
38
+ },
39
+ "execution_count": null,
40
+ "outputs": []
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {
46
+ "id": "kNbSbsctxahq",
47
+ "cellView": "form"
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "#@title #Installation\n",
52
+ "#@markdown This cell will install dependencies for Whisper-WebUI!\n",
53
+ "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
54
+ "%cd Whisper-WebUI\n",
55
+ "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
+ "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install ctranslate2==4.4.0\n",
58
+ "!pip install gradio\n",
59
+ "!pip install gradio-i18n\n",
60
+ "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
61
+ "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
62
+ "!pip install tokenizers==0.19.1\n",
63
+ "!pip install pyannote.audio==3.3.1\n",
64
+ "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "source": [
70
+ "#@title # (Optional) Configure arguments\n",
71
+ "#@markdown This section is used to configure some command line arguments.\n",
72
+ "\n",
73
+ "#@markdown You can simply ignore this section and the default values will be used.\n",
74
+ "\n",
75
+ "USERNAME = '' #@param {type: \"string\"}\n",
76
+ "PASSWORD = '' #@param {type: \"string\"}\n",
77
+ "WHISPER_TYPE = 'faster-whisper' # @param [\"whisper\", \"faster-whisper\", \"insanely-fast-whisper\"]\n",
78
+ "THEME = '' #@param {type: \"string\"}\n",
79
+ "\n",
80
+ "arguments = \"\"\n",
81
+ "if USERNAME:\n",
82
+ " arguments += f\" --username {USERNAME}\"\n",
83
+ "if PASSWORD:\n",
84
+ " arguments += f\" --password {PASSWORD}\"\n",
85
+ "if THEME:\n",
86
+ " arguments += f\" --theme {THEME}\"\n",
87
+ "if WHISPER_TYPE:\n",
88
+ " arguments += f\" --whisper_type {WHISPER_TYPE}\"\n",
89
+ "\n",
90
+ "\n",
91
+ "#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
92
+ ],
93
+ "metadata": {
94
+ "id": "Qosz9BFlGui3",
95
+ "cellView": "form"
96
+ },
97
+ "execution_count": null,
98
+ "outputs": []
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 3,
103
+ "metadata": {
104
+ "id": "PQroYRRZzQiN",
105
+ "cellView": "form"
106
+ },
107
+ "outputs": [],
108
+ "source": [
109
+ "#@title #Run\n",
110
+ "#@markdown Once the installation is complete, you can use public URL that is displayed.\n",
111
+ "if 'arguments' in locals():\n",
112
+ " !python app.py --share --colab{arguments}\n",
113
+ "else:\n",
114
+ " !python app.py --share --colab"
115
+ ]
116
+ }
117
+ ],
118
+ "metadata": {
119
+ "colab": {
120
+ "provenance": [],
121
+ "gpuType": "T4"
122
+ },
123
+ "kernelspec": {
124
+ "display_name": "Python 3",
125
+ "name": "python3"
126
+ },
127
+ "language_info": {
128
+ "name": "python"
129
+ },
130
+ "accelerator": "GPU"
131
+ },
132
+ "nbformat": 4,
133
+ "nbformat_minor": 0
134
+ }
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Remove the --extra-index-url line below if you're not using Nvidia GPU.
2
+ # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
+ # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
+ # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
+ --extra-index-url https://download.pytorch.org/whl/cu124
6
+
7
+ gradio==4.44.1
8
+ torch
9
+ torchaudio
10
+ git+https://github.com/jhj0517/jhj0517-whisper.git
11
+ faster-whisper==1.0.3
12
+ transformers
13
+ gradio-i18n
14
+ pytubefix
15
+ ruamel.yaml==0.18.6
16
+ pyannote.audio==3.3.1
17
+ git+https://github.com/jhj0517/ultimatevocalremover_api.git
18
+ git+https://github.com/jhj0517/pyrubberband.git
start-webui.bat ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
@echo off

call venv\scripts\activate

rem Announce the launch BEFORE starting the app; the original echoed it
rem only after python exited, which was misleading.
echo "launching the app"
python app.py %*

pause
start-webui.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/bin/bash

source venv/bin/activate

# Announce the launch BEFORE starting the app; the original echoed it
# only after python exited, which was misleading.
echo "launching the app"
python app.py "$@"
tests/test_bgm_separation.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import torch
10
+ import os
11
+
12
+
13
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, True, False),
        (WhisperImpl.FASTER_WHISPER.value, False, True, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, True, False)
    ]
)
def test_bgm_separation_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with BGM separation enabled."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
32
+
33
+
34
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, True, True, False),
        (WhisperImpl.FASTER_WHISPER.value, True, True, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, True, False)
    ]
)
def test_bgm_separation_with_vad_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with both VAD and BGM separation on."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
53
+
tests/test_config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import jiwer
3
+ import os
4
+ import torch
5
+
6
+ from modules.utils.paths import *
7
+ from modules.utils.youtube_manager import *
8
+
9
+ TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
10
+ TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
11
+ TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
12
+ TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
13
+ TEST_WHISPER_MODEL = "tiny"
14
+ TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
15
+ TEST_NLLB_MODEL = "facebook/nllb-200-distilled-600M"
16
+ TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
17
+ TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
18
+
19
+
20
@functools.lru_cache
def is_cuda_available():
    """Cached check for CUDA availability (queried once per test session)."""
    return torch.cuda.is_available()
23
+
24
+
25
@functools.lru_cache
def is_pytube_detected_bot(url: str = TEST_YOUTUBE_URL):
    """
    Return True when pytube is blocked ("detected as a bot") for *url*.

    Used by tests to skip the YouTube path when downloading is impossible.
    """
    try:
        yt_temp_path = os.path.join("modules", "yt_tmp.wav")
        if os.path.exists(yt_temp_path):
            # A previously downloaded temp file proves pytube works.
            return False
        yt = get_ytdata(url)
        # Probing the audio stream is what actually triggers the bot check;
        # the returned object itself is not needed.
        get_ytaudio(yt)
        return False
    except Exception as e:
        # Deliberately broad: ANY failure here means we cannot rely on pytube.
        print(f"Pytube has detected as a bot: {e}")
        return True
37
+
38
+
39
def calculate_wer(answer, prediction):
    """Word error rate between the reference *answer* and the *prediction*."""
    return jiwer.wer(answer, prediction)
tests/test_diarization.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import os
10
+
11
+
12
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, False, True),
        (WhisperImpl.FASTER_WHISPER.value, False, False, True),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, True)
    ]
)
def test_diarization_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with speaker diarization enabled."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
31
+
tests/test_srt.srt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 1
2
+ 00:00:00,000 --> 00:00:02,240
3
+ You've got
4
+
5
+ 2
6
+ 00:00:02,240 --> 00:00:04,160
7
+ a friend in me.
tests/test_transcription.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.whisper.whisper_factory import WhisperFactory
2
+ from modules.whisper.data_classes import *
3
+ from modules.utils.subtitle_manager import read_file
4
+ from modules.utils.paths import WEBUI_DIR
5
+ from test_config import *
6
+
7
+ import requests
8
+ import pytest
9
+ import gradio as gr
10
+ import os
11
+
12
+
13
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, False, False),
        (WhisperImpl.FASTER_WHISPER.value, False, False, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False)
    ]
)
def test_transcribe(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """End-to-end smoke test covering the file, youtube and microphone inputs."""
    tests_dir = os.path.join(WEBUI_DIR, "tests")
    audio_path = os.path.join(tests_dir, "jfk.wav")
    if not os.path.exists(audio_path):
        download_file(TEST_FILE_DOWNLOAD_URL, tests_dir)

    # Diarized output prefixes the text with the speaker label.
    answer = "SPEAKER_00|" + TEST_ANSWER if diarization else TEST_ANSWER

    whisper_inferencer = WhisperFactory.create_whisper_inference(
        whisper_type=whisper_type,
    )
    print(
        f"""Whisper Device : {whisper_inferencer.device}\n"""
        f"""BGM Separation Device: {whisper_inferencer.music_separator.device}\n"""
        f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
    )

    hparams = TranscriptionPipelineParams(
        whisper=WhisperParams(
            model_size=TEST_WHISPER_MODEL,
            compute_type=whisper_inferencer.current_compute_type
        ),
        vad=VadParams(
            vad_filter=vad_filter
        ),
        bgm_separation=BGMSeparationParams(
            is_separate_bgm=bgm_separation,
            enable_offload=True
        ),
        diarization=DiarizationParams(
            is_diarize=diarization
        ),
    ).to_list()

    def _wer_of(subtitle_path: str) -> float:
        # The third line of the SRT holds the first segment's text; strip
        # punctuation before comparing against the reference sentence.
        text_line = read_file(subtitle_path).split("\n")[2]
        return calculate_wer(answer, text_line.strip().replace(",", "").replace(".", ""))

    subtitle_str, file_paths = whisper_inferencer.transcribe_file(
        [audio_path],
        None,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    assert _wer_of(file_paths[0]) < 0.1

    if not is_pytube_detected_bot():
        subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
            TEST_YOUTUBE_URL,
            "SRT",
            False,
            gr.Progress(),
            *hparams,
        )
        assert isinstance(subtitle_str, str) and subtitle_str
        assert os.path.exists(file_path)

    subtitle_str, file_path = whisper_inferencer.transcribe_mic(
        audio_path,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    assert _wer_of(file_path) < 0.1
93
+
94
+
95
def download_file(url, save_dir):
    """
    Download *url* into *save_dir*, keeping the original file name.

    Skips the download entirely when the shared test fixture
    (TEST_FILE_PATH) already exists.
    """
    if os.path.exists(TEST_FILE_PATH):
        return

    os.makedirs(save_dir, exist_ok=True)

    file_name = url.split("/")[-1]
    file_path = os.path.join(save_dir, file_name)

    # Bound the request and fail loudly on HTTP errors instead of silently
    # writing an error page to disk as if it were the audio fixture.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File downloaded to: {file_path}")
+ print(f"File downloaded to: {file_path}")
tests/test_translation.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.translation.deepl_api import DeepLAPI
2
+ from modules.translation.nllb_inference import NLLBInference
3
+ from test_config import *
4
+
5
+ import os
6
+ import pytest
7
+
8
+
9
@pytest.mark.parametrize("model_size, file_path", [
    (TEST_NLLB_MODEL, TEST_SUBTITLE_SRT_PATH),
    (TEST_NLLB_MODEL, TEST_SUBTITLE_VTT_PATH),
])
def test_nllb_inference(
    model_size: str,
    file_path: str
):
    """Translate an SRT and a VTT subtitle file English -> Korean with NLLB
    and check that a result string and an output file path are produced."""
    translator = NLLBInference()
    print(f"NLLB Device : {translator.device}")

    translated_text, output_paths = translator.translate_file(
        fileobjs=[file_path],
        model_size=model_size,
        src_lang="eng_Latn",
        tgt_lang="kor_Hang",
    )

    # Smoke assertions only: the translation content itself is model-dependent.
    assert isinstance(translated_text, str)
    assert isinstance(output_paths[0], str)
29
+
30
+
31
@pytest.mark.skipif(
    # `not os.getenv(...)` already covers both the unset (None) and
    # empty-string cases, so a single call suffices.
    not os.getenv("DEEPL_API_KEY"),
    reason="DeepL API key is unavailable"
)
@pytest.mark.parametrize("file_path", [
    TEST_SUBTITLE_SRT_PATH,
    TEST_SUBTITLE_VTT_PATH,
])
def test_deepl_api(
    file_path: str
):
    """Translate a subtitle file English -> Korean via the DeepL API and
    check that a result string and an output file path are produced.

    Skipped unless the DEEPL_API_KEY environment variable is set and
    non-empty (the test performs a real network call).
    """
    deepl_api = DeepLAPI()

    api_key = os.getenv("DEEPL_API_KEY")

    result_str, file_paths = deepl_api.translate_deepl(
        auth_key=api_key,
        fileobjs=[file_path],
        source_lang="English",
        target_lang="Korean",
        is_pro=False,
        add_timestamp=True,
    )

    # Smoke assertions only: the translated content depends on the service.
    assert isinstance(result_str, str)
    assert isinstance(file_paths[0], str)
tests/test_vad.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import os
10
+
11
+
12
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, True, False, False),
        (WhisperImpl.FASTER_WHISPER.value, True, False, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, False, False)
    ]
)
def test_vad_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the full transcription test with the VAD filter enabled for each
    whisper implementation.

    NOTE(review): importing `test_transcribe` from test_transcription likely
    makes pytest collect that test twice (once per module) — confirm whether
    that duplication is intended.
    """
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )