root committed on
Commit
08ce36d
·
0 Parent(s):

Clean init push

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +10 -0
  2. .gitignore +14 -0
  3. .no_build +0 -0
  4. Dockerfile +34 -0
  5. Install.bat +21 -0
  6. Install.sh +18 -0
  7. LICENSE +201 -0
  8. README.md +130 -0
  9. app.py +352 -0
  10. configs/translation.yaml +459 -0
  11. docker-compose.yaml +29 -0
  12. modules/__init__.py +0 -0
  13. modules/diarize/__init__.py +0 -0
  14. modules/diarize/audio_loader.py +179 -0
  15. modules/diarize/diarize_pipeline.py +98 -0
  16. modules/diarize/diarizer.py +145 -0
  17. modules/translation/__init__.py +0 -0
  18. modules/translation/deepl_api.py +217 -0
  19. modules/translation/nllb_inference.py +289 -0
  20. modules/translation/translation_base.py +181 -0
  21. modules/ui/__init__.py +0 -0
  22. modules/ui/htmls.py +97 -0
  23. modules/utils/__init__.py +0 -0
  24. modules/utils/cli_manager.py +12 -0
  25. modules/utils/constants.py +6 -0
  26. modules/utils/files_manager.py +75 -0
  27. modules/utils/paths.py +32 -0
  28. modules/utils/subtitle_manager.py +438 -0
  29. modules/utils/youtube_manager.py +33 -0
  30. modules/uvr/music_separator.py +183 -0
  31. modules/vad/__init__.py +0 -0
  32. modules/vad/silero_vad.py +265 -0
  33. modules/whisper/__init__.py +0 -0
  34. modules/whisper/base_transcription_pipeline.py +563 -0
  35. modules/whisper/data_classes.py +608 -0
  36. modules/whisper/faster_whisper_inference.py +176 -0
  37. modules/whisper/insanely_fast_whisper_inference.py +195 -0
  38. modules/whisper/whisper_Inference.py +111 -0
  39. modules/whisper/whisper_factory.py +84 -0
  40. notebook/whisper-webui.ipynb +134 -0
  41. requirements.txt +18 -0
  42. start-webui.bat +7 -0
  43. start-webui.sh +6 -0
  44. tests/test_bgm_separation.py +53 -0
  45. tests/test_config.py +40 -0
  46. tests/test_diarization.py +31 -0
  47. tests/test_srt.srt +7 -0
  48. tests/test_transcription.py +110 -0
  49. tests/test_translation.py +56 -0
  50. tests/test_vad.py +26 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # from .gitignore
2
+ modules/yt_tmp.wav
3
+ **/venv/
4
+ **/__pycache__/
5
+ **/outputs/
6
+ **/models/
7
+
8
+ **/.idea
9
+ **/.git
10
+ **/.github
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.wav
2
+ *.png
3
+ *.mp4
4
+ *.mp3
5
+ .idea/
6
+ .pytest_cache/
7
+ venv/
8
+ modules/ui/__pycache__/
9
+ outputs/
10
+ modules/__pycache__/
11
+ models/
12
+ modules/yt_tmp.wav
13
+ configs/default_parameters.yaml
14
+ __pycache__/
.no_build ADDED
File without changes
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Multi-stage build: resolve Python dependencies into a venv in a throwaway
# builder image, then copy only the finished venv into a slim runtime image.
FROM debian:bookworm-slim AS builder

RUN apt-get update && \
    apt-get install -y curl git python3 python3-pip python3-venv && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && \
    mkdir -p /Whisper-WebUI

WORKDIR /Whisper-WebUI

# Copy only requirements.txt first so the dependency layer is cached
# independently of source-code changes.
COPY requirements.txt .

RUN python3 -m venv venv && \
    . venv/bin/activate && \
    pip install --no-cache-dir -r requirements.txt


FROM debian:bookworm-slim AS runtime

# ffmpeg is required at runtime for audio decoding.
RUN apt-get update && \
    apt-get install -y curl ffmpeg python3 && \
    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

WORKDIR /Whisper-WebUI

COPY . .
COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv

# Persist downloaded models and generated outputs outside the container.
VOLUME [ "/Whisper-WebUI/models" ]
VOLUME [ "/Whisper-WebUI/outputs" ]

ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
# NVIDIA cuBLAS/cuDNN shared libraries ship inside pip wheels in the venv and
# must be visible to the dynamic linker. Debian venvs put site-packages under
# lib/ (sys.platlibdir == "lib"), so the original lib64/-only path would miss
# them; list both lib/ and lib64/ for compatibility.
ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib/python3.11/site-packages/nvidia/cudnn/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib

ENTRYPOINT [ "python", "app.py" ]
Install.bat ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off

rem Create the virtual environment only if it does not exist yet.
if not exist "%~dp0\venv\Scripts" (
    echo Creating venv...
    python -m venv venv
)
echo checked the venv folder. now installing requirements..

rem Activate the venv so the following pip commands install into it.
call "%~dp0\venv\scripts\activate"

python -m pip install -U pip
pip install -r requirements.txt

rem Report the outcome of the requirements installation.
if errorlevel 1 goto install_failed
echo.
echo Requirements installed successfully.
goto install_done

:install_failed
echo.
echo Requirements installation failed. please remove venv folder and run install.bat again.

:install_done
pause
Install.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Create a local virtual environment (if missing) and install the project
# requirements into it, reporting success or failure.

if [ ! -d "venv" ]; then
    echo "Creating virtual environment..."
    python -m venv venv
fi

# Activate the venv so pip installs into it rather than the system Python.
source venv/bin/activate

python -m pip install -U pip

if pip install -r requirements.txt; then
    echo "Requirements installed successfully."
else
    echo ""
    echo "Requirements installation failed. Please remove the venv folder and run the script again."
    deactivate
    exit 1
fi

deactivate
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2023 jhj0517
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Whisper-WebUI
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.37.2
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Whisper-WebUI
14
+ A Gradio-based browser interface for [Whisper](https://github.com/openai/whisper). You can use it as an Easy Subtitle Generator!
15
+
16
+ ![Whisper WebUI](https://github.com/jhj0517/Whisper-WebUI/blob/master/screenshot.png)
17
+
18
+ ## Notebook
19
+ If you wish to try this on Colab, you can do it in [here](https://colab.research.google.com/github/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)!
20
+
21
+ # Feature
22
+ - Select the Whisper implementation you want to use between :
23
+ - [openai/whisper](https://github.com/openai/whisper)
24
+ - [SYSTRAN/faster-whisper](https://github.com/SYSTRAN/faster-whisper) (used by default)
25
+ - [insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
26
+ - Generate subtitles from various sources, including :
27
+ - Files
28
+ - Youtube
29
+ - Microphone
30
+ - Currently supported subtitle formats :
31
+ - SRT
32
+ - WebVTT
33
+ - txt ( only text file without timeline )
34
+ - Speech to Text Translation
35
+ - From other languages to English. ( This is Whisper's end-to-end speech-to-text translation feature )
36
+ - Text to Text Translation
37
+ - Translate subtitle files using Facebook NLLB models
38
+ - Translate subtitle files using DeepL API
39
+ - Pre-processing audio input with [Silero VAD](https://github.com/snakers4/silero-vad).
40
+ - Post-processing with speaker diarization using the [pyannote](https://huggingface.co/pyannote/speaker-diarization-3.1) model.
41
+ - To download the pyannote model, you need to have a Huggingface token and manually accept their terms in the pages below.
42
+ 1. https://huggingface.co/pyannote/speaker-diarization-3.1
43
+ 2. https://huggingface.co/pyannote/segmentation-3.0
44
+
45
+ # Installation and Running
46
+ ### Prerequisite
47
+ To run this WebUI, you need to have `git`, `python` version 3.8 ~ 3.10, `FFmpeg` and `CUDA` (if you use NVIDIA GPU) version above 12.0
48
+
49
+ Please follow the links below to install the necessary software:
50
+ - git : [https://git-scm.com/downloads](https://git-scm.com/downloads)
51
+ - python : [https://www.python.org/downloads/](https://www.python.org/downloads/) **( If your python version is too new, torch will not install properly.)**
52
+ - FFmpeg : [https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)
53
+ - CUDA : [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
54
+
55
+ After installing FFmpeg, **make sure to add the `FFmpeg/bin` folder to your system PATH!**
56
+
57
+ ### Automatic Installation
58
+
59
+ 1. Download `Whisper-WebUI.zip` with the file corresponding to your OS from [v1.0.0](https://github.com/jhj0517/Whisper-WebUI/releases/tag/v1.0.0) and extract its contents.
60
+ 2. Run `Install.bat` or `Install.sh` to install dependencies. (This will create a `venv` directory and install dependencies there.)
61
+ 3. Start WebUI with `start-webui.bat` or `start-webui.sh`
62
+ 4. To update the WebUI, run `update.bat` or `update.sh`
63
+
64
+ And you can also run the project with command line arguments if you like by running `start-webui.bat`, see [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for a guide to arguments.
65
+
66
+ ## Running with Docker
67
+
68
+ 1. Build the image
69
+
70
+ ```sh
71
+ docker build -t whisper-webui:latest .
72
+ ```
73
+
74
+ 2. Run the container with commands
75
+
76
+ - For bash :
77
+ ```sh
78
+ docker run --gpus all -d \
79
+ -v /path/to/models:/Whisper-WebUI/models \
80
+ -v /path/to/outputs:/Whisper-WebUI/outputs \
81
+ -p 7860:7860 \
82
+ -it \
83
+ whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
84
+ ```
85
+ - For PowerShell:
86
+ ```shell
87
+ docker run --gpus all -d `
88
+ -v /path/to/models:/Whisper-WebUI/models `
89
+ -v /path/to/outputs:/Whisper-WebUI/outputs `
90
+ -p 7860:7860 `
91
+ -it `
92
+ whisper-webui:latest --server_name 0.0.0.0 --server_port 7860
93
+ ```
94
+
95
+ # VRAM Usages
96
+ This project is integrated with [faster-whisper](https://github.com/guillaumekln/faster-whisper) by default for better VRAM usage and transcription speed.
97
+
98
+ According to faster-whisper, the efficiency of the optimized whisper model is as follows:
99
+ | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
100
+ |-------------------|-----------|-----------|-------|-----------------|-----------------|
101
+ | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
102
+ | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
103
+
104
+ If you want to use an implementation other than faster-whisper, use `--whisper_type` arg and the repository name.<br>
105
+ Read [wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments) for more info about CLI args.
106
+
107
+ ## Available models
108
+ This is Whisper's original VRAM usage table for models.
109
+
110
+ | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
111
+ |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
112
+ | tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x |
113
+ | base | 74 M | `base.en` | `base` | ~1 GB | ~16x |
114
+ | small | 244 M | `small.en` | `small` | ~2 GB | ~6x |
115
+ | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x |
116
+ | large | 1550 M | N/A | `large` | ~10 GB | 1x |
117
+
118
+
119
+ `.en` models are for English only, and the cool thing is that you can use the `Translate to English` option from the "large" models!
120
+
121
+ ## TODO🗓
122
+
123
+ - [x] Add DeepL API translation
124
+ - [x] Add NLLB Model translation
125
+ - [x] Integrate with faster-whisper
126
+ - [x] Integrate with insanely-fast-whisper
127
+ - [x] Integrate with whisperX ( Only speaker diarization part )
128
+ - [ ] Add fast api script
129
+
130
+
app.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import gradio as gr
4
+ from gradio_i18n import Translate, gettext as _
5
+ import yaml
6
+
7
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
8
+ INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
9
+ UVR_MODELS_DIR, I18N_YAML_PATH)
10
+ from modules.utils.files_manager import load_yaml
11
+ from modules.whisper.whisper_factory import WhisperFactory
12
+ from modules.translation.nllb_inference import NLLBInference
13
+ from modules.ui.htmls import *
14
+ from modules.utils.cli_manager import str2bool
15
+ from modules.utils.youtube_manager import get_ytmetas
16
+ from modules.translation.deepl_api import DeepLAPI
17
+ from modules.whisper.data_classes import *
18
+
19
+
20
+ class App:
21
    def __init__(self, args):
        """Build the Gradio Blocks app and instantiate every inference backend.

        Parameters
        ----------
        args : argparse.Namespace
            Parsed CLI arguments. This constructor reads ``theme``,
            ``whisper_type``, the various ``*_model_dir`` paths,
            ``output_dir``, and (elsewhere in the class) ``colab``.
        """
        self.args = args
        # delete_cache=(60, 3600): presumably sweeps Gradio's temp-file cache
        # every 60s, deleting entries older than 1h -- confirm against Gradio docs.
        self.app = gr.Blocks(css=CSS, theme=self.args.theme, delete_cache=(60, 3600))
        # i18n context manager driven by the translation YAML; entered in launch().
        self.i18n = Translate(I18N_YAML_PATH)
        # Factory selects the Whisper implementation named by --whisper_type
        # (faster-whisper / openai whisper / insanely-fast-whisper).
        self.whisper_inf = WhisperFactory.create_whisper_inference(
            whisper_type=self.args.whisper_type,
            whisper_model_dir=self.args.whisper_model_dir,
            faster_whisper_model_dir=self.args.faster_whisper_model_dir,
            insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
            uvr_model_dir=self.args.uvr_model_dir,
            output_dir=self.args.output_dir,
        )
        # NLLB text-to-text translation backend; writes under <output_dir>/translations.
        self.nllb_inf = NLLBInference(
            model_dir=self.args.nllb_model_dir,
            output_dir=os.path.join(self.args.output_dir, "translations")
        )
        # DeepL API client shares the same translations output directory.
        self.deepl_api = DeepLAPI(
            output_dir=os.path.join(self.args.output_dir, "translations")
        )
        # Default UI parameter values loaded from the YAML config file.
        self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        print(f"Use \"{self.args.whisper_type}\" implementation\n"
              f"Device \"{self.whisper_inf.device}\" is detected")
43
+
44
    def create_pipeline_inputs(self):
        """Create the shared transcription-parameter widgets for a tab.

        Builds the model/language/format dropdowns, the translate/timestamp
        checkboxes, and the four advanced-parameter accordions (Whisper, BGM
        separation, VAD, diarization), wiring the model dropdown's change
        event to ``on_change_models``.

        Returns
        -------
        tuple
            ``(pipeline_inputs, dd_file_format, cb_timestamp)`` where
            ``pipeline_inputs`` is the flat list of Gradio components passed
            to the transcription functions. NOTE(review): the component order
            in ``pipeline_inputs`` presumably must match the parameter order
            expected by ``transcribe_*`` -- do not reorder.
        """
        # Per-section default values loaded from the parameters YAML.
        whisper_params = self.default_params["whisper"]
        vad_params = self.default_params["vad"]
        diarization_params = self.default_params["diarization"]
        uvr_params = self.default_params["bgm_separation"]

        with gr.Row():
            dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
                                   label=_("Model"))
            # The stored config keeps the unwrapped string form of
            # AUTOMATIC_DETECTION, so map it back to the sentinel for the UI.
            dd_lang = gr.Dropdown(choices=self.whisper_inf.available_langs + [AUTOMATIC_DETECTION],
                                  value=AUTOMATIC_DETECTION if whisper_params["lang"] == AUTOMATIC_DETECTION.unwrap()
                                  else whisper_params["lang"], label=_("Language"))
            dd_file_format = gr.Dropdown(choices=["SRT", "WebVTT", "txt", "LRC"], value=whisper_params["file_format"], label=_("File Format"))
        with gr.Row():
            cb_translate = gr.Checkbox(value=whisper_params["is_translate"], label=_("Translate to English?"),
                                       interactive=True)
        with gr.Row():
            cb_timestamp = gr.Checkbox(value=whisper_params["add_timestamp"],
                                       label=_("Add a timestamp to the end of the filename"),
                                       interactive=True)

        # Advanced Whisper parameters rendered from the data class; contents
        # vary with the selected whisper implementation.
        with gr.Accordion(_("Advanced Parameters"), open=False):
            whisper_inputs = WhisperParams.to_gradio_inputs(defaults=whisper_params, only_advanced=True,
                                                            whisper_type=self.args.whisper_type,
                                                            available_compute_types=self.whisper_inf.available_compute_types,
                                                            compute_type=self.whisper_inf.current_compute_type)

        # Background-music (UVR) separation pre-processing controls.
        with gr.Accordion(_("Background Music Remover Filter"), open=False):
            uvr_inputs = BGMSeparationParams.to_gradio_input(defaults=uvr_params,
                                                             available_models=self.whisper_inf.music_separator.available_models,
                                                             available_devices=self.whisper_inf.music_separator.available_devices,
                                                             device=self.whisper_inf.music_separator.device)

        # Silero VAD pre-processing controls.
        with gr.Accordion(_("Voice Detection Filter"), open=False):
            vad_inputs = VadParams.to_gradio_inputs(defaults=vad_params)

        # Speaker-diarization post-processing controls.
        with gr.Accordion(_("Diarization"), open=False):
            diarization_inputs = DiarizationParams.to_gradio_inputs(defaults=diarization_params,
                                                                    available_devices=self.whisper_inf.diarizer.available_device,
                                                                    device=self.whisper_inf.diarizer.device)

        # Changing the model may toggle the translate checkbox (e.g. for
        # English-only models) -- handled by on_change_models.
        dd_model.change(fn=self.on_change_models, inputs=[dd_model], outputs=[cb_translate])

        pipeline_inputs = [dd_model, dd_lang, cb_translate] + whisper_inputs + vad_inputs + diarization_inputs + uvr_inputs

        return (
            pipeline_inputs,
            dd_file_format,
            cb_timestamp
        )
94
+
95
+ def launch(self):
96
+ translation_params = self.default_params["translation"]
97
+ deepl_params = translation_params["deepl"]
98
+ nllb_params = translation_params["nllb"]
99
+ uvr_params = self.default_params["bgm_separation"]
100
+
101
+ with self.app:
102
+ with self.i18n:
103
+ with gr.Row():
104
+ with gr.Column():
105
+ gr.Markdown(MARKDOWN, elem_id="md_project")
106
+ with gr.Tabs():
107
+ with gr.TabItem(_("File")): # tab1
108
+ with gr.Column():
109
+ input_file = gr.Files(type="filepath", label=_("Upload File here"))
110
+ tb_input_folder = gr.Textbox(label="Input Folder Path (Optional)",
111
+ info="Optional: Specify the folder path where the input files are located, if you prefer to use local files instead of uploading them."
112
+ " Leave this field empty if you do not wish to use a local path.",
113
+ visible=self.args.colab,
114
+ value="")
115
+
116
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
117
+
118
+ with gr.Row():
119
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
120
+ with gr.Row():
121
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
122
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3, interactive=False)
123
+ btn_openfolder = gr.Button('📂', scale=1)
124
+
125
+ params = [input_file, tb_input_folder, dd_file_format, cb_timestamp]
126
+ btn_run.click(fn=self.whisper_inf.transcribe_file,
127
+ inputs=params + pipeline_params,
128
+ outputs=[tb_indicator, files_subtitles])
129
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
130
+
131
+ with gr.TabItem(_("Youtube")): # tab2
132
+ with gr.Row():
133
+ tb_youtubelink = gr.Textbox(label=_("Youtube Link"))
134
+ with gr.Row(equal_height=True):
135
+ with gr.Column():
136
+ img_thumbnail = gr.Image(label=_("Youtube Thumbnail"))
137
+ with gr.Column():
138
+ tb_title = gr.Label(label=_("Youtube Title"))
139
+ tb_description = gr.Textbox(label=_("Youtube Description"), max_lines=15)
140
+
141
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
142
+
143
+ with gr.Row():
144
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
145
+ with gr.Row():
146
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
147
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
148
+ btn_openfolder = gr.Button('📂', scale=1)
149
+
150
+ params = [tb_youtubelink, dd_file_format, cb_timestamp]
151
+
152
+ btn_run.click(fn=self.whisper_inf.transcribe_youtube,
153
+ inputs=params + pipeline_params,
154
+ outputs=[tb_indicator, files_subtitles])
155
+ tb_youtubelink.change(get_ytmetas, inputs=[tb_youtubelink],
156
+ outputs=[img_thumbnail, tb_title, tb_description])
157
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
158
+
159
+ with gr.TabItem(_("Mic")): # tab3
160
+ with gr.Row():
161
+ mic_input = gr.Microphone(label=_("Record with Mic"), type="filepath", interactive=True)
162
+
163
+ pipeline_params, dd_file_format, cb_timestamp = self.create_pipeline_inputs()
164
+
165
+ with gr.Row():
166
+ btn_run = gr.Button(_("GENERATE SUBTITLE FILE"), variant="primary")
167
+ with gr.Row():
168
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
169
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
170
+ btn_openfolder = gr.Button('📂', scale=1)
171
+
172
+ params = [mic_input, dd_file_format, cb_timestamp]
173
+
174
+ btn_run.click(fn=self.whisper_inf.transcribe_mic,
175
+ inputs=params + pipeline_params,
176
+ outputs=[tb_indicator, files_subtitles])
177
+ btn_openfolder.click(fn=lambda: self.open_folder("outputs"), inputs=None, outputs=None)
178
+
179
+ with gr.TabItem(_("T2T Translation")): # tab 4
180
+ with gr.Row():
181
+ file_subs = gr.Files(type="filepath", label=_("Upload Subtitle Files to translate here"))
182
+
183
+ with gr.TabItem(_("DeepL API")): # sub tab1
184
+ with gr.Row():
185
+ tb_api_key = gr.Textbox(label=_("Your Auth Key (API KEY)"),
186
+ value=deepl_params["api_key"])
187
+ with gr.Row():
188
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
189
+ value=AUTOMATIC_DETECTION if deepl_params["source_lang"] == AUTOMATIC_DETECTION.unwrap()
190
+ else deepl_params["source_lang"],
191
+ choices=list(self.deepl_api.available_source_langs.keys()))
192
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
193
+ value=deepl_params["target_lang"],
194
+ choices=list(self.deepl_api.available_target_langs.keys()))
195
+ with gr.Row():
196
+ cb_is_pro = gr.Checkbox(label=_("Pro User?"), value=deepl_params["is_pro"])
197
+ with gr.Row():
198
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
199
+ label=_("Add a timestamp to the end of the filename"),
200
+ interactive=True)
201
+ with gr.Row():
202
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
203
+ with gr.Row():
204
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
205
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
206
+ btn_openfolder = gr.Button('📂', scale=1)
207
+
208
+ btn_run.click(fn=self.deepl_api.translate_deepl,
209
+ inputs=[tb_api_key, file_subs, dd_source_lang, dd_target_lang,
210
+ cb_is_pro, cb_timestamp],
211
+ outputs=[tb_indicator, files_subtitles])
212
+
213
+ btn_openfolder.click(
214
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
215
+ inputs=None,
216
+ outputs=None)
217
+
218
+ with gr.TabItem(_("NLLB")): # sub tab2
219
+ with gr.Row():
220
+ dd_model_size = gr.Dropdown(label=_("Model"), value=nllb_params["model_size"],
221
+ choices=self.nllb_inf.available_models)
222
+ dd_source_lang = gr.Dropdown(label=_("Source Language"),
223
+ value=nllb_params["source_lang"],
224
+ choices=self.nllb_inf.available_source_langs)
225
+ dd_target_lang = gr.Dropdown(label=_("Target Language"),
226
+ value=nllb_params["target_lang"],
227
+ choices=self.nllb_inf.available_target_langs)
228
+ with gr.Row():
229
+ nb_max_length = gr.Number(label="Max Length Per Line", value=nllb_params["max_length"],
230
+ precision=0)
231
+ with gr.Row():
232
+ cb_timestamp = gr.Checkbox(value=translation_params["add_timestamp"],
233
+ label=_("Add a timestamp to the end of the filename"),
234
+ interactive=True)
235
+ with gr.Row():
236
+ btn_run = gr.Button(_("TRANSLATE SUBTITLE FILE"), variant="primary")
237
+ with gr.Row():
238
+ tb_indicator = gr.Textbox(label=_("Output"), scale=5)
239
+ files_subtitles = gr.Files(label=_("Downloadable output file"), scale=3)
240
+ btn_openfolder = gr.Button('📂', scale=1)
241
+ with gr.Column():
242
+ md_vram_table = gr.HTML(NLLB_VRAM_TABLE, elem_id="md_nllb_vram_table")
243
+
244
+ btn_run.click(fn=self.nllb_inf.translate_file,
245
+ inputs=[file_subs, dd_model_size, dd_source_lang, dd_target_lang,
246
+ nb_max_length, cb_timestamp],
247
+ outputs=[tb_indicator, files_subtitles])
248
+
249
+ btn_openfolder.click(
250
+ fn=lambda: self.open_folder(os.path.join(self.args.output_dir, "translations")),
251
+ inputs=None,
252
+ outputs=None)
253
+
254
+ with gr.TabItem(_("BGM Separation")):
255
+ files_audio = gr.Files(type="filepath", label=_("Upload Audio Files to separate background music"))
256
+ dd_uvr_device = gr.Dropdown(label=_("Device"), value=self.whisper_inf.music_separator.device,
257
+ choices=self.whisper_inf.music_separator.available_devices)
258
+ dd_uvr_model_size = gr.Dropdown(label=_("Model"), value=uvr_params["model_size"],
259
+ choices=self.whisper_inf.music_separator.available_models)
260
+ nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"],
261
+ precision=0)
262
+ cb_uvr_save_file = gr.Checkbox(label=_("Save separated files to output"),
263
+ value=True, visible=False)
264
+ btn_run = gr.Button(_("SEPARATE BACKGROUND MUSIC"), variant="primary")
265
+ with gr.Column():
266
+ with gr.Row():
267
+ ad_instrumental = gr.Audio(label=_("Instrumental"), scale=8)
268
+ btn_open_instrumental_folder = gr.Button('📂', scale=1)
269
+ with gr.Row():
270
+ ad_vocals = gr.Audio(label=_("Vocals"), scale=8)
271
+ btn_open_vocals_folder = gr.Button('📂', scale=1)
272
+
273
+ btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
274
+ inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
275
+ cb_uvr_save_file],
276
+ outputs=[ad_instrumental, ad_vocals])
277
+ btn_open_instrumental_folder.click(inputs=None,
278
+ outputs=None,
279
+ fn=lambda: self.open_folder(os.path.join(
280
+ self.args.output_dir, "UVR", "instrumental"
281
+ )))
282
+ btn_open_vocals_folder.click(inputs=None,
283
+ outputs=None,
284
+ fn=lambda: self.open_folder(os.path.join(
285
+ self.args.output_dir, "UVR", "vocals"
286
+ )))
287
+
288
+ # Launch the app with optional gradio settings
289
+ args = self.args
290
+ self.app.queue(
291
+ api_open=args.api_open
292
+ ).launch(
293
+ share=args.share,
294
+ server_name=args.server_name,
295
+ server_port=args.server_port,
296
+ auth=(args.username, args.password) if args.username and args.password else None,
297
+ root_path=args.root_path,
298
+ inbrowser=args.inbrowser
299
+ )
300
+
301
+ @staticmethod
302
+ def open_folder(folder_path: str):
303
+ if os.path.exists(folder_path):
304
+ os.system(f"start {folder_path}")
305
+ else:
306
+ os.makedirs(folder_path, exist_ok=True)
307
+ print(f"The directory path {folder_path} has newly created.")
308
+
309
@staticmethod
def on_change_models(model_size: str):
    """Toggle the 'Translate to English?' checkbox for the selected model.

    Only the multilingual "large" family supports Whisper's built-in
    translation task, so the checkbox is shown for those models and
    hidden (and forced off) for everything else.
    """
    translatable = {"large", "large-v1", "large-v2", "large-v3"}
    if model_size in translatable:
        return gr.Checkbox(visible=True, value=False, label="Translate to English?", interactive=True)
    return gr.Checkbox(visible=False, value=False, interactive=False)
316
+
317
+
318
# Command-line interface for launching the WebUI.
parser = argparse.ArgumentParser()
parser.add_argument('--whisper_type', type=str, default=WhisperImpl.FASTER_WHISPER.value,
                    choices=[impl.value for impl in WhisperImpl],
                    help='A type of the whisper implementation (Github repo name)')
# Gradio server / UI options. Boolean flags use `nargs='?', const=True` so a
# bare `--flag` (no value) enables the option.
parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True,
                    help='Gradio share value')
parser.add_argument('--server_name', type=str, default=None,
                    help='Gradio server host')
parser.add_argument('--server_port', type=int, default=None,
                    help='Gradio server port')
parser.add_argument('--root_path', type=str, default=None,
                    help='Gradio root path')
parser.add_argument('--username', type=str, default=None,
                    help='Gradio authentication username')
parser.add_argument('--password', type=str, default=None,
                    help='Gradio authentication password')
parser.add_argument('--theme', type=str, default=None,
                    help='Gradio Blocks theme')
parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True,
                    help='Is colab user or not')
parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True,
                    help='Enable api or not in Gradio')
parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True,
                    help='Whether to automatically start Gradio app or not')
# Model / output directory options (defaults come from modules.utils.paths).
parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                    help='Directory path of the whisper model')
parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
                    help='Directory path of the faster-whisper model')
parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=INSANELY_FAST_WHISPER_MODELS_DIR,
                    help='Directory path of the insanely-fast-whisper model')
parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MODELS_DIR,
                    help='Directory path of the diarization model')
parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                    help='Directory path of the Facebook NLLB model')
parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
                    help='Directory path of the UVR model')
parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR,
                    help='Directory path of the outputs')
_args = parser.parse_args()
349
+
350
+ if __name__ == "__main__":
351
+ app = App(args=_args)
352
+ app.launch(share=True)
configs/translation.yaml ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ en: # English
2
+ Language: Language
3
+ File: File
4
+ Youtube: Youtube
5
+ Mic: Mic
6
+ T2T Translation: T2T Translation
7
+ BGM Separation: BGM Separation
8
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
9
+ Output: Output
10
+ Downloadable output file: Downloadable output file
11
+ Upload File here: Upload File here
12
+ Model: Model
13
+ Automatic Detection: Automatic Detection
14
+ File Format: File Format
15
+ Translate to English?: Translate to English?
16
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
17
+ Advanced Parameters: Advanced Parameters
18
+ Background Music Remover Filter: Background Music Remover Filter
19
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
20
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
21
+ Save separated files to output: Save separated files to output
22
+ Offload sub model after removing background music: Offload sub model after removing background music
23
+ Voice Detection Filter: Voice Detection Filter
24
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
25
+ Enable Silero VAD Filter: Enable Silero VAD Filter
26
+ Diarization: Diarization
27
+ Enable Diarization: Enable Diarization
28
+ HuggingFace Token: HuggingFace Token
29
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirements.
30
+ Device: Device
31
+ Youtube Link: Youtube Link
32
+ Youtube Thumbnail: Youtube Thumbnail
33
+ Youtube Title: Youtube Title
34
+ Youtube Description: Youtube Description
35
+ Record with Mic: Record with Mic
36
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
37
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
38
+ Source Language: Source Language
39
+ Target Language: Target Language
40
+ Pro User?: Pro User?
41
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
42
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
43
+ Instrumental: Instrumental
44
+ Vocals: Vocals
45
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
46
+
47
+ ko: # Korean
48
+ Language: 언어
49
+ File: 파일
50
+ Youtube: 유튜브
51
+ Mic: 마이크
52
+ T2T Translation: T2T 자막 번역
53
+ BGM Separation: 배경 음악 분리
54
+ GENERATE SUBTITLE FILE: 자막 파일 생성
55
+ Output: 결과물
56
+ Downloadable output file: 결과물 파일 다운로드
57
+ Upload File here: 파일을 업로드 하세요
58
+ Model: 모델
59
+ Automatic Detection: 자동 감지
60
+ File Format: 파일 형식
61
+ Translate to English?: 영어로 번역합니까? (위스퍼 모델 자체 번역 기능)
62
+ Add a timestamp to the end of the filename: 파일 이름 끝에 타임스탬프 붙이기
63
+ Advanced Parameters: 고급 변수
64
+ Background Music Remover Filter: 배경 음악 제거 필터
65
+ Enabling this will remove background music: 받아쓰기 이전에 먼저 배경 음악 제거용 서브 모델을 활성화 합니다.
66
+ Enable Background Music Remover Filter: 배경 음악 제거 필터 활성화
67
+ Save separated files to output: 분리된 배경 음악 & 음성 파일 따로 출력 폴더에 저장
68
+ Offload sub model after removing background music: 배경 음악 제거 후 서브 모델을 비활성화 합니다. (VRAM 이 부족할 시 체크하세요.)
69
+ Voice Detection Filter: 목소리 감지 필터
70
+ Enable this to transcribe only detected voice: 서브 모델에 의해 목소리라고 판단된 부분만 받아쓰기를 진행합니다.
71
+ Enable Silero VAD Filter: Silero VAD 필터 활성화
72
+ Diarization: 화자 구분
73
+ Enable Diarization: 화자 구분 활성화
74
+ HuggingFace Token: 허깅페이스 토큰
75
+ This is only needed the first time you download the model: 모델을 처음 다운받을 때만 토큰이 필요합니다. 이미 다운로드 받으신 상태라면 입력하지 않아도 됩니다. 모델을 다운 받기 위해선 "https://huggingface.co/pyannote/speaker-diarization-3.1" 와 "https://huggingface.co/pyannote/segmentation-3.0" 에서 먼저 사용 지침에 동의하셔야 합니다.
76
+ Device: 디바이스
77
+ Youtube Link: 유튜브 링크
78
+ Youtube Thumbnail: 유튜브 썸네일
79
+ Youtube Title: 유튜브 제목
80
+ Youtube Description: 유튜브 설명
81
+ Record with Mic: 마이크로 녹음하세요
82
+ Upload Subtitle Files to translate here: 번역할 자막 파일을 업로드 하세요
83
+ Your Auth Key (API KEY): DeepL API 키
84
+ Source Language: 원본 언어
85
+ Target Language: 대상 언어
86
+ Pro User?: Pro 버전 사용자
87
+ TRANSLATE SUBTITLE FILE: 자막 파일 번역
88
+ Upload Audio Files to separate background music: 배경 음악을 분리할 오디오 파일을 업로드 하세요
89
+ Instrumental: 악기
90
+ Vocals: 보컬
91
+ SEPARATE BACKGROUND MUSIC: 배경 음악 분리
92
+
93
+ ja: # Japanese
94
+ Language: 言語
95
+ File: File
96
+ Youtube: Youtube
97
+ Mic: Mic
98
+ T2T Translation: T2T Translation
99
+ BGM Separation: BGM Separation
100
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
101
+ Output: Output
102
+ Downloadable output file: Downloadable output file
103
+ Upload File here: Upload File here
104
+ Model: Model
105
+ Automatic Detection: Automatic Detection
106
+ File Format: File Format
107
+ Translate to English?: Translate to English?
108
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
109
+ Advanced Parameters: Advanced Parameters
110
+ Background Music Remover Filter: Background Music Remover Filter
111
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
112
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
113
+ Save separated files to output: Save separated files to output
114
+ Offload sub model after removing background music: Offload sub model after removing background music
115
+ Voice Detection Filter: Voice Detection Filter
116
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
117
+ Enable Silero VAD Filter: Enable Silero VAD Filter
118
+ Diarization: Diarization
119
+ Enable Diarization: Enable Diarization
120
+ HuggingFace Token: HuggingFace Token
121
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
122
+ Device: Device
123
+ Youtube Link: Youtube Link
124
+ Youtube Thumbnail: Youtube Thumbnail
125
+ Youtube Title: Youtube Title
126
+ Youtube Description: Youtube Description
127
+ Record with Mic: Record with Mic
128
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
129
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
130
+ Source Language: Source Language
131
+ Target Language: Target Language
132
+ Pro User?: Pro User?
133
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
134
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
135
+ Instrumental: Instrumental
136
+ Vocals: Vocals
137
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
138
+
139
+ es: # Spanish
140
+ Language: Idioma
141
+ File: File
142
+ Youtube: Youtube
143
+ Mic: Mic
144
+ T2T Translation: T2T Translation
145
+ BGM Separation: BGM Separation
146
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
147
+ Output: Output
148
+ Downloadable output file: Downloadable output file
149
+ Upload File here: Upload File here
150
+ Model: Model
151
+ Automatic Detection: Automatic Detection
152
+ File Format: File Format
153
+ Translate to English?: Translate to English?
154
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
155
+ Advanced Parameters: Advanced Parameters
156
+ Background Music Remover Filter: Background Music Remover Filter
157
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
158
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
159
+ Save separated files to output: Save separated files to output
160
+ Offload sub model after removing background music: Offload sub model after removing background music
161
+ Voice Detection Filter: Voice Detection Filter
162
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
163
+ Enable Silero VAD Filter: Enable Silero VAD Filter
164
+ Diarization: Diarization
165
+ Enable Diarization: Enable Diarization
166
+ HuggingFace Token: HuggingFace Token
167
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
168
+ Device: Device
169
+ Youtube Link: Youtube Link
170
+ Youtube Thumbnail: Youtube Thumbnail
171
+ Youtube Title: Youtube Title
172
+ Youtube Description: Youtube Description
173
+ Record with Mic: Record with Mic
174
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
175
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
176
+ Source Language: Source Language
177
+ Target Language: Target Language
178
+ Pro User?: Pro User?
179
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
180
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
181
+ Instrumental: Instrumental
182
+ Vocals: Vocals
183
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
184
+
185
+ fr: # French
186
+ Language: Langue
187
+ File: File
188
+ Youtube: Youtube
189
+ Mic: Mic
190
+ T2T Translation: T2T Translation
191
+ BGM Separation: BGM Separation
192
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
193
+ Output: Output
194
+ Downloadable output file: Downloadable output file
195
+ Upload File here: Upload File here
196
+ Model: Model
197
+ Automatic Detection: Automatic Detection
198
+ File Format: File Format
199
+ Translate to English?: Translate to English?
200
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
201
+ Advanced Parameters: Advanced Parameters
202
+ Background Music Remover Filter: Background Music Remover Filter
203
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
204
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
205
+ Save separated files to output: Save separated files to output
206
+ Offload sub model after removing background music: Offload sub model after removing background music
207
+ Voice Detection Filter: Voice Detection Filter
208
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
209
+ Enable Silero VAD Filter: Enable Silero VAD Filter
210
+ Diarization: Diarization
211
+ Enable Diarization: Enable Diarization
212
+ HuggingFace Token: HuggingFace Token
213
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
214
+ Device: Device
215
+ Youtube Link: Youtube Link
216
+ Youtube Thumbnail: Youtube Thumbnail
217
+ Youtube Title: Youtube Title
218
+ Youtube Description: Youtube Description
219
+ Record with Mic: Record with Mic
220
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
221
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
222
+ Source Language: Source Language
223
+ Target Language: Target Language
224
+ Pro User?: Pro User?
225
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
226
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
227
+ Instrumental: Instrumental
228
+ Vocals: Vocals
229
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
230
+
231
+ de: # German
232
+ Language: Sprache
233
+ File: File
234
+ Youtube: Youtube
235
+ Mic: Mic
236
+ T2T Translation: T2T Translation
237
+ BGM Separation: BGM Separation
238
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
239
+ Output: Output
240
+ Downloadable output file: Downloadable output file
241
+ Upload File here: Upload File here
242
+ Model: Model
243
+ Automatic Detection: Automatic Detection
244
+ File Format: File Format
245
+ Translate to English?: Translate to English?
246
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
247
+ Advanced Parameters: Advanced Parameters
248
+ Background Music Remover Filter: Background Music Remover Filter
249
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
250
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
251
+ Save separated files to output: Save separated files to output
252
+ Offload sub model after removing background music: Offload sub model after removing background music
253
+ Voice Detection Filter: Voice Detection Filter
254
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
255
+ Enable Silero VAD Filter: Enable Silero VAD Filter
256
+ Diarization: Diarization
257
+ Enable Diarization: Enable Diarization
258
+ HuggingFace Token: HuggingFace Token
259
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
260
+ Device: Device
261
+ Youtube Link: Youtube Link
262
+ Youtube Thumbnail: Youtube Thumbnail
263
+ Youtube Title: Youtube Title
264
+ Youtube Description: Youtube Description
265
+ Record with Mic: Record with Mic
266
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
267
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
268
+ Source Language: Source Language
269
+ Target Language: Target Language
270
+ Pro User?: Pro User?
271
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
272
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
273
+ Instrumental: Instrumental
274
+ Vocals: Vocals
275
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
276
+
277
+ zh: # Chinese
278
+ Language: 语言
279
+ File: File
280
+ Youtube: Youtube
281
+ Mic: Mic
282
+ T2T Translation: T2T Translation
283
+ BGM Separation: BGM Separation
284
+ GENERATE SUBTITLE FILE: GENERATE SUBTITLE FILE
285
+ Output: Output
286
+ Downloadable output file: Downloadable output file
287
+ Upload File here: Upload File here
288
+ Model: Model
289
+ Automatic Detection: Automatic Detection
290
+ File Format: File Format
291
+ Translate to English?: Translate to English?
292
+ Add a timestamp to the end of the filename: Add a timestamp to the end of the filename
293
+ Advanced Parameters: Advanced Parameters
294
+ Background Music Remover Filter: Background Music Remover Filter
295
+ Enabling this will remove background music: Enabling this will remove background music by submodel before transcribing
296
+ Enable Background Music Remover Filter: Enable Background Music Remover Filter
297
+ Save separated files to output: Save separated files to output
298
+ Offload sub model after removing background music: Offload sub model after removing background music
299
+ Voice Detection Filter: Voice Detection Filter
300
+ Enable this to transcribe only detected voice: Enable this to transcribe only detected voice parts by submodel.
301
+ Enable Silero VAD Filter: Enable Silero VAD Filter
302
+ Diarization: Diarization
303
+ Enable Diarization: Enable Diarization
304
+ HuggingFace Token: HuggingFace Token
305
+ This is only needed the first time you download the model: This is only needed the first time you download the model. If you already have models, you don't need to enter. To download the model, you must manually go to "https://huggingface.co/pyannote/speaker-diarization-3.1" and "https://huggingface.co/pyannote/segmentation-3.0" and agree to their requirement.
306
+ Device: Device
307
+ Youtube Link: Youtube Link
308
+ Youtube Thumbnail: Youtube Thumbnail
309
+ Youtube Title: Youtube Title
310
+ Youtube Description: Youtube Description
311
+ Record with Mic: Record with Mic
312
+ Upload Subtitle Files to translate here: Upload Subtitle Files to translate here
313
+ Your Auth Key (API KEY): Your Auth Key (API KEY)
314
+ Source Language: Source Language
315
+ Target Language: Target Language
316
+ Pro User?: Pro User?
317
+ TRANSLATE SUBTITLE FILE: TRANSLATE SUBTITLE FILE
318
+ Upload Audio Files to separate background music: Upload Audio Files to separate background music
319
+ Instrumental: Instrumental
320
+ Vocals: Vocals
321
+ SEPARATE BACKGROUND MUSIC: SEPARATE BACKGROUND MUSIC
322
+
323
+ uk: # Ukrainian
324
+ Language: Мова
325
+ File: Файл
326
+ Youtube: Youtube
327
+ Mic: Мікрофон
328
+ T2T Translation: T2T Переклад
329
+ BGM Separation: Розділення фонової музики
330
+ GENERATE SUBTITLE FILE: СТВОРИТИ ФАЙЛ СУБТИТРІВ
331
+ Output: Результат
332
+ Downloadable output file: Завантажуваний файл результату
333
+ Upload File here: Завантажте файл тут
334
+ Model: Модель
335
+ Automatic Detection: Автоматичне визначення
336
+ File Format: Формат файлу
337
+ Translate to English?: Перекласти на англійську?
338
+ Add a timestamp to the end of the filename: Додати мітку часу до кінця імені файлу
339
+ Advanced Parameters: Розширені параметри
340
+ Background Music Remover Filter: Фільтр видалення фонової музики
341
+ Enabling this will remove background music: Увімкнення цього видалить фонову музику за допомогою підмоделі перед транскрипцією
342
+ Enable Background Music Remover Filter: Увімкнути фільтр видалення фонової музики
343
+ Save separated files to output: Зберегти розділені файли до вихідної папки
344
+ Offload sub model after removing background music: Вивантажити підмодель після видалення фонової музики
345
+ Voice Detection Filter: Фільтр розпізнавання голосу
346
+ Enable this to transcribe only detected voice: Увімкніть це, щоб транскрибувати лише розпізнані голосові частини за допомогою підмоделі
347
+ Enable Silero VAD Filter: Увімкнути фільтр Silero VAD
348
+ Diarization: Діаризація
349
+ Enable Diarization: Увімкнути діаризацію
350
+ HuggingFace Token: Токен HuggingFace
351
+ This is only needed the first time you download the model: Це потрібно лише при першому завантаженні моделі. Якщо у вас вже є моделі, вводити не потрібно. Щоб завантажити модель, потрібно вручну перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" та "https://huggingface.co/pyannote/segmentation-3.0" і погодитися з їхніми вимогами.
352
+ Device: Пристрій
353
+ Youtube Link: Посилання на Youtube
354
+ Youtube Thumbnail: Ескіз Youtube
355
+ Youtube Title: Назва Youtube
356
+ Youtube Description: Опис Youtube
357
+ Record with Mic: Записати з мікрофона
358
+ Upload Subtitle Files to translate here: Завантажте файли субтитрів для перекладу тут
359
+ Your Auth Key (API KEY): Ваш ключ авторизації (API KEY)
360
+ Source Language: Мова джерела
361
+ Target Language: Мова перекладу
362
+ Pro User?: Професійний користувач?
363
+ TRANSLATE SUBTITLE FILE: ПЕРЕКЛАСТИ ФАЙЛ СУБТИТРІВ
364
+ Upload Audio Files to separate background music: Завантажте аудіофайли для розділення фонової музики
365
+ Instrumental: Інструментал
366
+ Vocals: Вокал
367
+ SEPARATE BACKGROUND MUSIC: РОЗДІЛИТИ ФОНОВУ МУЗИКУ
368
+
369
+ ru: # Russian
370
+ Language: Язык
371
+ File: Файл
372
+ Youtube: Youtube
373
+ Mic: Микрофон
374
+ T2T Translation: Перевод T2T
375
+ BGM Separation: Разделение фоновой музыки
376
+ GENERATE SUBTITLE FILE: СГЕНЕРИРОВАТЬ ФАЙЛ СУБТИТРОВ
377
+ Output: Результат
378
+ Downloadable output file: Загружаемый файл результата
379
+ Upload File here: Загрузите файл здесь
380
+ Model: Модель
381
+ Automatic Detection: Автоматическое определение
382
+ File Format: Формат файла
383
+ Translate to English?: Перевести на английский?
384
+ Add a timestamp to the end of the filename: Добавить метку времени в конец имени файла
385
+ Advanced Parameters: Расширенные параметры
386
+ Background Music Remover Filter: Фильтр удаления фоновой музыки
387
+ Enabling this will remove background music: Включение этого удалит фоновую музыку с помощью подмодели перед транскрипцией
388
+ Enable Background Music Remover Filter: Включить фильтр удаления фоновой музыки
389
+ Save separated files to output: Сохранить разделенные файлы в выходную папку
390
+ Offload sub model after removing background music: Выгрузить подмодель после удаления фоновой музыки
391
+ Voice Detection Filter: Фильтр обнаружения голоса
392
+ Enable this to transcribe only detected voice: Включите это, чтобы транскрибировать только обнаруженные голосовые части с помощью подмодели
393
+ Enable Silero VAD Filter: Включить фильтр Silero VAD
394
+ Diarization: Диаризация
395
+ Enable Diarization: Включить диаризацию
396
+ HuggingFace Token: Токен HuggingFace
397
+ This is only needed the first time you download the model: Это нужно только при первом скачивании модели. Если у вас уже есть модели, вводить не нужно. Чтобы скачать модель, нужно вручную перейти на "https://huggingface.co/pyannote/speaker-diarization-3.1" и "https://huggingface.co/pyannote/segmentation-3.0" и согласиться с их требованиями.
398
+ Device: Устройство
399
+ Youtube Link: Ссылка на Youtube
400
+ Youtube Thumbnail: Миниатюра Youtube
401
+ Youtube Title: Название Youtube
402
+ Youtube Description: Описание Youtube
403
+ Record with Mic: Записать с микрофона
404
+ Upload Subtitle Files to translate here: Загрузите файлы субтитров для перевода здесь
405
+ Your Auth Key (API KEY): Ваш Auth Key (API KEY)
406
+ Source Language: Исходный язык
407
+ Target Language: Целевой язык
408
+ Pro User?: Профессиональный пользователь?
409
+ TRANSLATE SUBTITLE FILE: ПЕРЕВЕСТИ ФАЙЛ СУБТИТРОВ
410
+ Upload Audio Files to separate background music: Загрузите аудиофайлы для разделения фоновой музыки
411
+ Instrumental: Инструментал
412
+ Vocals: Вокал
413
+ SEPARATE BACKGROUND MUSIC: РАЗДЕЛИТЬ ФОНОВУЮ МУЗЫКУ
414
+
415
+ tr: # Turkish
416
+ Language: Dil
417
+ File: Dosya
418
+ Youtube: Youtube
419
+ Mic: Mikrofon
420
+ T2T Translation: T2T Çeviri
421
+ BGM Separation: Arka Plan Müziği Ayırma
422
+ GENERATE SUBTITLE FILE: ALTYAZI DOSYASI OLUŞTUR
423
+ Output: Çıktı
424
+ Downloadable output file: İndirilebilir çıktı dosyası
425
+ Upload File here: Dosya Yükle
426
+ Model: Model
427
+ Automatic Detection: Otomatik Algılama
428
+ File Format: Dosya Formatı
429
+ Translate to English?: İngilizceye Çevir?
430
+ Add a timestamp to the end of the filename: Dosya adının sonuna zaman damgası ekle
431
+ Advanced Parameters: Gelişmiş Parametreler
432
+ Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresi
433
+ Enabling this will remove background music: Bunu etkinleştirmek, arka plan müziğini alt model tarafından transkripsiyondan önce kaldıracaktır
434
+ Enable Background Music Remover Filter: Arka Plan Müziği Kaldırma Filtresini Etkinleştir
435
+ Save separated files to output: Ayrılmış dosyaları çıktıya kaydet
436
+ Offload sub model after removing background music: Arka plan müziği kaldırıldıktan sonra alt modeli devre dışı bırak
437
+ Voice Detection Filter: Ses Algılama Filtresi
438
+ Enable this to transcribe only detected voice: Bunu etkinleştirerek yalnızca alt model tarafından algılanan ses kısımlarını transkribe et
439
+ Enable Silero VAD Filter: Silero VAD Filtresini Etkinleştir
440
+ Diarization: Konuşmacı Ayrımı
441
+ Enable Diarization: Konuşmacı Ayrımını Etkinleştir
442
+ HuggingFace Token: HuggingFace Anahtarı
443
+ This is only needed the first time you download the model: Bu, modeli ilk kez indirirken gereklidir. Zaten modelleriniz varsa girmenize gerek yok. Modeli indirmek için "https://huggingface.co/pyannote/speaker-diarization-3.1" ve "https://huggingface.co/pyannote/segmentation-3.0" adreslerine gidip gereksinimlerini kabul etmeniz gerekiyor
444
+ Device: Cihaz
445
+ Youtube Link: Youtube Bağlantısı
446
+ Youtube Thumbnail: Youtube Küçük Resmi
447
+ Youtube Title: Youtube Başlığı
448
+ Youtube Description: Youtube Açıklaması
449
+ Record with Mic: Mikrofonla Kaydet
450
+ Upload Subtitle Files to translate here: Çeviri için altyazı dosyalarını buraya yükle
451
+ Your Auth Key (API KEY): Yetki Anahtarınız (API ANAHTARI)
452
+ Source Language: Kaynak Dil
453
+ Target Language: Hedef Dil
454
+ Pro User?: Pro Kullanıcı?
455
+ TRANSLATE SUBTITLE FILE: ALTYAZI DOSYASINI ÇEVİR
456
+ Upload Audio Files to separate background music: Arka plan müziğini ayırmak için ses dosyalarını yükle
457
+ Instrumental: Enstrümantal
458
+ Vocals: Vokal
459
+ SEPARATE BACKGROUND MUSIC: ARKA PLAN MÜZİĞİNİ AYIR
docker-compose.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ app:
3
+ build: .
4
+ image: jhj0517/whisper-webui:latest
5
+
6
+ volumes:
7
+ # Update paths to mount models and output paths to your custom paths like this, e.g:
8
+ # - C:/whisper-models/custom-path:/Whisper-WebUI/models
9
+ # - C:/whisper-webui-outputs/custom-path:/Whisper-WebUI/outputs
10
+ - /Whisper-WebUI/models
11
+ - /Whisper-WebUI/outputs
12
+
13
+ ports:
14
+ - "7860:7860"
15
+
16
+ stdin_open: true
17
+ tty: true
18
+
19
+ entrypoint: ["python", "app.py", "--server_port", "7860", "--server_name", "0.0.0.0"]
20
+
21
+ # If you're not using nvidia GPU, Update device to match yours.
22
+ # See more info at : https://docs.docker.com/compose/compose-file/deploy/#driver
23
+ deploy:
24
+ resources:
25
+ reservations:
26
+ devices:
27
+ - driver: nvidia
28
+ count: all
29
+ capabilities: [ gpu ]
modules/__init__.py ADDED
File without changes
modules/diarize/__init__.py ADDED
File without changes
modules/diarize/audio_loader.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
2
+
3
+ import os
4
+ import subprocess
5
+ from functools import lru_cache
6
+ from typing import Optional, Union
7
+ from scipy.io.wavfile import write
8
+ import tempfile
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
def exact_div(x, y):
    """Divide x by y, asserting that the division leaves no remainder."""
    quotient, remainder = divmod(x, y)
    assert remainder == 0
    return quotient
17
+
18
# hard-coded audio hyperparameters (these match OpenAI Whisper's feature extractor)
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input

N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
29
+
30
+
31
def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.

    Parameters
    ----------
    file: Union[str, np.ndarray]
        The audio file to open or a numpy array containing the audio data.

    sr: int
        The sample rate to resample the audio if necessary.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    if isinstance(file, np.ndarray):
        # Normalize ndarray input to mono float32 before handing it to ffmpeg.
        if file.dtype != np.float32:
            file = file.astype(np.float32)
        if file.ndim > 1:
            file = np.mean(file, axis=1)

        # ffmpeg reads from a path, so round-trip the array through a temp WAV.
        # NOTE(review): the WAV is written at SAMPLE_RATE regardless of the
        # array's true rate — assumes ndarray input is already 16 kHz; confirm
        # at call sites.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
        temp_file_path = temp_file.name
        temp_file.close()
    else:
        temp_file_path = file

    try:
        # Decode to raw 16-bit little-endian PCM, mono, resampled to `sr`,
        # streamed to stdout ("-") and captured via subprocess.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads",
            "0",
            "-i",
            temp_file_path,
            "-f",
            "s16le",
            "-ac",
            "1",
            "-acodec",
            "pcm_s16le",
            "-ar",
            str(sr),
            "-",
        ]
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    finally:
        # Only delete the temp file when we created it ourselves above.
        if isinstance(file, np.ndarray):
            os.remove(temp_file_path)

    # Convert s16le bytes back to float32 in [-1, 1).
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
86
+
87
+
88
def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
    """
    Pad with trailing zeros or truncate `array` along `axis` so its size is
    exactly `length` (N_SAMPLES by default), as expected by the encoder.
    Works on both torch tensors and numpy arrays.
    """
    is_tensor = torch.is_tensor(array)

    # Truncate when the axis is too long.
    if array.shape[axis] > length:
        if is_tensor:
            keep = torch.arange(length, device=array.device)
            array = array.index_select(dim=axis, index=keep)
        else:
            array = array.take(indices=range(length), axis=axis)

    # Zero-pad on the right when the axis is too short.
    if array.shape[axis] < length:
        pad_spec = [(0, 0)] * array.ndim
        pad_spec[axis] = (0, length - array.shape[axis])
        if is_tensor:
            # F.pad wants a flat list ordered from the last dimension inward.
            flat_spec = [amount for pair in pad_spec[::-1] for amount in pair]
            array = F.pad(array, flat_spec)
        else:
            array = np.pad(array, pad_spec)

    return array
112
+
113
+
114
@lru_cache(maxsize=None)
def mel_filters(device, n_mels: int) -> torch.Tensor:
    """
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    """
    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
    # Result is cached per (device, n_mels). The precomputed filterbank asset
    # must ship in an "assets" directory next to this module.
    with np.load(
        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
    ) as f:
        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
130
+
131
+
132
def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the log-Mel spectrogram of the given audio.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters (80 and 128 are supported, see mel_filters)

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio = torch.from_numpy(audio)

    if device is not None:
        audio = audio.to(device)
    if padding > 0:
        audio = F.pad(audio, (0, padding))
    window = torch.hann_window(N_FFT).to(audio.device)
    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
    # Drop the final STFT frame so the frame count matches Whisper's layout.
    magnitudes = stft[..., :-1].abs() ** 2

    filters = mel_filters(audio.device, n_mels)
    mel_spec = filters @ magnitudes

    # Log compression with a floor, clamped to an 8-decade dynamic range,
    # then rescaled roughly into [-1, 1] as Whisper's encoder expects.
    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0
    return log_spec
modules/diarize/diarize_pipeline.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import os
6
+ from pyannote.audio import Pipeline
7
+ from typing import Optional, Union
8
+ import torch
9
+
10
+ from modules.whisper.data_classes import *
11
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
12
+ from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
13
+
14
+
15
class DiarizationPipeline:
    """Thin wrapper around pyannote's pretrained speaker-diarization pipeline.

    Loads the model once in the constructor and exposes a callable that
    returns the diarization as a pandas DataFrame.
    """

    def __init__(
        self,
        model_name="pyannote/speaker-diarization-3.1",
        cache_dir: str = DIARIZATION_MODELS_DIR,
        use_auth_token=None,
        device: Optional[Union[str, torch.device]] = "cpu",
    ):
        if isinstance(device, str):
            device = torch.device(device)
        # use_auth_token is only needed on first download of the gated model.
        self.model = Pipeline.from_pretrained(
            model_name,
            use_auth_token=use_auth_token,
            cache_dir=cache_dir
        ).to(device)

    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
        """Diarize a file path or waveform; return a DataFrame with
        'segment', 'label', 'speaker', 'start', and 'end' columns."""
        if isinstance(audio, str):
            audio = load_audio(audio)
        # pyannote expects a (channels, samples) waveform tensor plus rate.
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
        # Flatten the pyannote annotation into one row per speaker turn.
        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
        return diarize_df
43
+
44
+
45
def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
    """Attach speaker labels from a diarization DataFrame to transcript segments.

    For each segment (and each word with timestamps), picks the speaker whose
    diarized turns overlap it the most; with `fill_nearest`, falls back to the
    closest turn when nothing overlaps. Mutates `diarize_df` in place by adding
    'intersection' and 'union' columns, and returns {"segments": [...]} as
    plain dicts with "speaker" keys filled in where a speaker was found.
    """
    def best_speaker(start, end):
        # Signed overlap between every diarized turn and [start, end];
        # negative values mean the turn is disjoint from the interval.
        diarize_df['intersection'] = np.minimum(diarize_df['end'], end) - np.maximum(diarize_df['start'], start)
        diarize_df['union'] = np.maximum(diarize_df['end'], end) - np.minimum(diarize_df['start'], start)

        overlapping = diarize_df[diarize_df["intersection"] > 0]
        if len(overlapping) > 0:
            # Speaker with the greatest total overlap wins.
            return overlapping.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
        if fill_nearest:
            # Least-negative intersection == nearest turn in time.
            return diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
        return None

    segments = transcript_result["segments"]
    # Accept either Segment models or plain dicts; normalize to dicts.
    if segments and isinstance(segments[0], Segment):
        segments = [seg.model_dump() for seg in segments]

    for seg in segments:
        speaker = best_speaker(seg['start'], seg['end'])
        if speaker is not None:
            seg["speaker"] = speaker

        words = seg.get('words')
        if words is not None:
            for word in words:
                if 'start' in word:
                    word_speaker = best_speaker(word['start'], word['end'])
                    if word_speaker is not None:
                        word["speaker"] = word_speaker

    return {"segments": segments}
92
+
93
+
94
class DiarizationSegment:
    """Plain container for a single diarized time span (speaker optional)."""

    def __init__(self, start, end, speaker=None):
        self.start, self.end, self.speaker = start, end, speaker
modules/diarize/diarizer.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ from typing import List, Union, BinaryIO, Optional, Tuple
4
+ import numpy as np
5
+ import time
6
+ import logging
7
+ import spaces
8
+
9
+ from modules.utils.paths import DIARIZATION_MODELS_DIR
10
+ from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
11
+ from modules.diarize.audio_loader import load_audio
12
+ from modules.whisper.data_classes import *
13
+
14
+
15
class Diarizer:
    """Speaker diarization as a post-processing step for Whisper transcripts.

    Wraps `DiarizationPipeline` (pyannote) and rewrites each transcribed
    segment's text as "SPEAKER|text".
    """

    def __init__(self,
                 model_dir: str = DIARIZATION_MODELS_DIR
                 ):
        self.device = self.get_device()
        self.available_device = self.get_available_device()
        self.compute_type = "float16"
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        # Lazily constructed by update_pipe() on the first run() call.
        self.pipe = None

    @spaces.GPU
    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            transcribed_result: List[Segment],
            use_auth_token: str,
            device: Optional[str] = None
            ) -> Tuple[List[Segment], float]:
        """
        Diarize transcribed result as a post-processing

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        transcribed_result: List[Segment]
            transcribed result through whisper.
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: Optional[str]
            Device for diarization.

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        start_time = time.time()

        if device is None:
            device = self.device

        # (Re)build the pipeline when the device changed or it was never built.
        # NOTE(review): update_pipe() can return without setting self.pipe when
        # no token is given and no model is cached; self.pipe(audio) below
        # would then fail — confirm callers supply a token on first use.
        if device != self.device or self.pipe is None:
            self.update_pipe(
                device=device,
                use_auth_token=use_auth_token
            )

        audio = load_audio(audio)

        diarization_segments = self.pipe(audio)
        diarized_result = assign_word_speakers(
            diarization_segments,
            {"segments": transcribed_result}
        )

        segments_result = []
        for segment in diarized_result["segments"]:
            speaker = "None"
            if "speaker" in segment:
                speaker = segment["speaker"]
            # Encode the speaker into the segment text as "SPEAKER|text".
            diarized_text = speaker + "|" + segment["text"].strip()
            segments_result.append(Segment(
                start=segment["start"],
                end=segment["end"],
                text=diarized_text
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    @spaces.GPU
    def update_pipe(self,
                    use_auth_token: str,
                    device: str
                    ):
        """
        Set pipeline for diarization

        Parameters
        ----------
        use_auth_token: str
            Huggingface token with READ permission. This is only needed the first time you download the model.
            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
        device: str
            Device for diarization.
        """
        self.device = device

        os.makedirs(self.model_dir, exist_ok=True)

        # Without a cached model or a token, downloading the gated model is
        # impossible — warn and bail out instead of raising deep inside pyannote.
        if (not os.listdir(self.model_dir) and
                not use_auth_token):
            print(
                "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
            )
            return

        logger = logging.getLogger("speechbrain.utils.train_logger")
        # Disable redundant torchvision warning message
        logger.disabled = True
        self.pipe = DiarizationPipeline(
            use_auth_token=use_auth_token,
            device=device,
            cache_dir=self.model_dir
        )
        logger.disabled = False

    @staticmethod
    @spaces.GPU
    def get_device():
        """Pick the single best device: cuda > mps > cpu."""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    @spaces.GPU
    def get_available_device():
        """List every usable device.

        Fix: the original used `elif`, so "mps" was never reported whenever
        CUDA was available; each backend is now checked independently.
        """
        devices = ["cpu"]
        if torch.cuda.is_available():
            devices.append("cuda")
        if torch.backends.mps.is_available():
            devices.append("mps")
        return devices
modules/translation/__init__.py ADDED
File without changes
modules/translation/deepl_api.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ import os
4
+ from datetime import datetime
5
+ import gradio as gr
6
+
7
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH
8
+ from modules.utils.constants import AUTOMATIC_DETECTION
9
+ from modules.utils.subtitle_manager import *
10
+ from modules.utils.files_manager import load_yaml, save_yaml
11
+
12
+ """
13
+ This is written with reference to the DeepL API documentation.
14
+ If you want to know the information of the DeepL API, see here: https://www.deepl.com/docs-api/documents
15
+ """
16
+
17
+ DEEPL_AVAILABLE_TARGET_LANGS = {
18
+ 'Bulgarian': 'BG',
19
+ 'Czech': 'CS',
20
+ 'Danish': 'DA',
21
+ 'German': 'DE',
22
+ 'Greek': 'EL',
23
+ 'English': 'EN',
24
+ 'English (British)': 'EN-GB',
25
+ 'English (American)': 'EN-US',
26
+ 'Spanish': 'ES',
27
+ 'Estonian': 'ET',
28
+ 'Finnish': 'FI',
29
+ 'French': 'FR',
30
+ 'Hungarian': 'HU',
31
+ 'Indonesian': 'ID',
32
+ 'Italian': 'IT',
33
+ 'Japanese': 'JA',
34
+ 'Korean': 'KO',
35
+ 'Lithuanian': 'LT',
36
+ 'Latvian': 'LV',
37
+ 'Norwegian (Bokmål)': 'NB',
38
+ 'Dutch': 'NL',
39
+ 'Polish': 'PL',
40
+ 'Portuguese': 'PT',
41
+ 'Portuguese (Brazilian)': 'PT-BR',
42
+ 'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
43
+ 'Romanian': 'RO',
44
+ 'Russian': 'RU',
45
+ 'Slovak': 'SK',
46
+ 'Slovenian': 'SL',
47
+ 'Swedish': 'SV',
48
+ 'Turkish': 'TR',
49
+ 'Ukrainian': 'UK',
50
+ 'Chinese (simplified)': 'ZH'
51
+ }
52
+
53
+ DEEPL_AVAILABLE_SOURCE_LANGS = {
54
+ AUTOMATIC_DETECTION: None,
55
+ 'Bulgarian': 'BG',
56
+ 'Czech': 'CS',
57
+ 'Danish': 'DA',
58
+ 'German': 'DE',
59
+ 'Greek': 'EL',
60
+ 'English': 'EN',
61
+ 'Spanish': 'ES',
62
+ 'Estonian': 'ET',
63
+ 'Finnish': 'FI',
64
+ 'French': 'FR',
65
+ 'Hungarian': 'HU',
66
+ 'Indonesian': 'ID',
67
+ 'Italian': 'IT',
68
+ 'Japanese': 'JA',
69
+ 'Korean': 'KO',
70
+ 'Lithuanian': 'LT',
71
+ 'Latvian': 'LV',
72
+ 'Norwegian (Bokmål)': 'NB',
73
+ 'Dutch': 'NL',
74
+ 'Polish': 'PL',
75
+ 'Portuguese (all Portuguese varieties mixed)': 'PT',
76
+ 'Romanian': 'RO',
77
+ 'Russian': 'RU',
78
+ 'Slovak': 'SK',
79
+ 'Slovenian': 'SL',
80
+ 'Swedish': 'SV',
81
+ 'Turkish': 'TR',
82
+ 'Ukrainian': 'UK',
83
+ 'Chinese': 'ZH'
84
+ }
85
+
86
+
87
class DeepLAPI:
    """Client for translating subtitle files with the DeepL REST API.

    See https://www.deepl.com/docs-api/documents for the API itself.
    """

    def __init__(self,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        # Seconds to sleep between requests, to stay under DeepL rate limits.
        self.api_interval = 1
        # DeepL accepts up to 50 text entries per request.
        self.max_text_batch_size = 50
        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
        self.output_dir = output_dir

    def translate_deepl(self,
                        auth_key: str,
                        fileobjs: list,
                        source_lang: str,
                        target_lang: str,
                        is_pro: bool = False,
                        add_timestamp: bool = True,
                        progress=gr.Progress()) -> list:
        """
        Translate subtitle files using DeepL API
        Parameters
        ----------
        auth_key: str
            API Key for DeepL from gr.Textbox()
        fileobjs: list
            List of files to transcribe from gr.Files()
        source_lang: str
            Source language of the file to transcribe from gr.Dropdown()
        target_lang: str
            Target language of the file to transcribe from gr.Dropdown()
        is_pro: bool
            Boolean value that is about pro user or not from gr.Checkbox().
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
        String to return to gr.Textbox()
        Files to return to gr.Files()
        """
        # Gradio hands over NamedString objects; normalize to plain paths.
        if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
            fileobjs = [fileobj.name for fileobj in fileobjs]

        self.cache_parameters(
            api_key=auth_key,
            is_pro=is_pro,
            source_lang=source_lang,
            target_lang=target_lang,
            add_timestamp=add_timestamp
        )

        files_info = {}
        for file_path in fileobjs:
            file_name, file_ext = os.path.splitext(os.path.basename(file_path))
            writer = get_writer(file_ext, self.output_dir)
            segments = writer.to_segments(file_path)

            # Translate in batches of max_text_batch_size, updating segments
            # in place with the translated text.
            batch_size = self.max_text_batch_size
            for batch_start in range(0, len(segments), batch_size):
                progress(batch_start / len(segments), desc="Translating..")
                sentences_to_translate = [seg.text for seg in segments[batch_start:batch_start + batch_size]]
                translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
                                                                target_lang, is_pro)
                for i, translated_text in enumerate(translated_texts):
                    segments[batch_start + i].text = translated_text["text"]

            subtitle, output_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_ext,
                result=segments,
                add_timestamp=add_timestamp
            )

            files_info[file_name] = {"subtitle": subtitle, "path": output_path}

        total_result = ''
        for file_name, info in files_info.items():
            total_result += '------------------------------------\n'
            total_result += f'{file_name}\n\n'
            total_result += f'{info["subtitle"]}'
        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

        output_file_paths = [item["path"] for key, item in files_info.items()]
        return [gr_str, output_file_paths]

    def request_deepl_translate(self,
                                auth_key: str,
                                text: list,
                                source_lang: str,
                                target_lang: str,
                                is_pro: bool = False):
        """Request API response to DeepL server; returns the 'translations' list."""
        # Fix: the two f-string halves were previously joined without a space
        # ("...is not supported.Use one of...").
        if source_lang not in list(DEEPL_AVAILABLE_SOURCE_LANGS.keys()):
            raise ValueError(f"Source language {source_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_SOURCE_LANGS.keys())}")
        if target_lang not in list(DEEPL_AVAILABLE_TARGET_LANGS.keys()):
            raise ValueError(f"Target language {target_lang} is not supported. "
                             f"Use one of {list(DEEPL_AVAILABLE_TARGET_LANGS.keys())}")

        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
        headers = {
            'Authorization': f'DeepL-Auth-Key {auth_key}'
        }
        data = {
            'text': text,
            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
        }
        response = requests.post(url, headers=headers, data=data)
        # Fail loudly on HTTP errors (e.g. 403 bad key, 456 quota exceeded)
        # instead of surfacing a confusing KeyError on "translations" below.
        response.raise_for_status()
        result = response.json()
        time.sleep(self.api_interval)
        return result["translations"]

    @staticmethod
    def cache_parameters(api_key: str,
                         is_pro: bool,
                         source_lang: str,
                         target_lang: str,
                         add_timestamp: bool):
        """Persist the last-used DeepL settings back to the parameters YAML."""
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["deepl"] = {
            "api_key": api_key,
            "is_pro": is_pro,
            "source_lang": source_lang,
            "target_lang": target_lang
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/translation/nllb_inference.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
2
+ import gradio as gr
3
+ import os
4
+ import spaces
5
+
6
+ from modules.utils.paths import TRANSLATION_OUTPUT_DIR, NLLB_MODELS_DIR
7
+ import modules.translation.translation_base as base
8
+
9
+
10
class NLLBInference(base.TranslationBase):
    """TranslationBase implementation backed by facebook's NLLB-200 models.

    Loads model/tokenizer via HuggingFace transformers and translates through
    a `translation` pipeline configured with fixed source/target languages.
    """

    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir
        )
        self.tokenizer = None
        self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
        # Lazily built by update_model(); holds the transformers pipeline.
        self.pipeline = None

    @spaces.GPU(duration=120)
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        """Translate a single string with the currently loaded pipeline."""
        result = self.pipeline(
            text,
            max_length=max_length
        )
        return result[0]["translation_text"]

    @spaces.GPU(duration=120)
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """(Re)load model/tokenizer if the size changed and rebuild the
        translation pipeline for the given language pair."""
        def validate_language(lang: str) -> str:
            # Accepts either a display name ("Korean") or an NLLB code
            # ("kor_Hang"); always returns the NLLB code.
            if lang in NLLB_AVAILABLE_LANGS:
                return NLLB_AVAILABLE_LANGS[lang]
            elif lang not in NLLB_AVAILABLE_LANGS.values():
                raise ValueError(f"Language '{lang}' is not supported. Use one of: {list(NLLB_AVAILABLE_LANGS.keys())}")
            return lang

        src_lang = validate_language(src_lang)
        tgt_lang = validate_language(tgt_lang)

        if model_size != self.current_model_size or self.model is None:
            print("\nInitializing NLLB Model..\n")
            progress(0, desc="Initializing NLLB Model..")
            self.current_model_size = model_size
            # Avoid re-downloading when the model is already cached on disk.
            local_files_only = self.is_model_exists(self.current_model_size)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
                                                               cache_dir=self.model_dir,
                                                               local_files_only=local_files_only)
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
                                                           cache_dir=os.path.join(self.model_dir, "tokenizers"),
                                                           local_files_only=local_files_only)

        # The pipeline is rebuilt on every call so src/tgt language changes
        # take effect even when the model itself is unchanged.
        self.pipeline = pipeline("translation",
                                 model=self.model,
                                 tokenizer=self.tokenizer,
                                 src_lang=src_lang,
                                 tgt_lang=tgt_lang,
                                 device=self.device)

    def is_model_exists(self,
                        model_size: str):
        """Check if model exists or not (Only facebook model)"""
        # HuggingFace cache layout: "models--facebook--<model-name>".
        prefix = "models--facebook--"
        _id, model_size_name = model_size.split("/")
        model_dir_name = prefix + model_size_name
        model_dir_path = os.path.join(self.model_dir, model_dir_name)
        if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
            return True
        return False
+
83
+
84
+ NLLB_AVAILABLE_LANGS = {
85
+ "Acehnese (Arabic script)": "ace_Arab",
86
+ "Acehnese (Latin script)": "ace_Latn",
87
+ "Mesopotamian Arabic": "acm_Arab",
88
+ "Ta’izzi-Adeni Arabic": "acq_Arab",
89
+ "Tunisian Arabic": "aeb_Arab",
90
+ "Afrikaans": "afr_Latn",
91
+ "South Levantine Arabic": "ajp_Arab",
92
+ "Akan": "aka_Latn",
93
+ "Amharic": "amh_Ethi",
94
+ "North Levantine Arabic": "apc_Arab",
95
+ "Modern Standard Arabic": "arb_Arab",
96
+ "Modern Standard Arabic (Romanized)": "arb_Latn",
97
+ "Najdi Arabic": "ars_Arab",
98
+ "Moroccan Arabic": "ary_Arab",
99
+ "Egyptian Arabic": "arz_Arab",
100
+ "Assamese": "asm_Beng",
101
+ "Asturian": "ast_Latn",
102
+ "Awadhi": "awa_Deva",
103
+ "Central Aymara": "ayr_Latn",
104
+ "South Azerbaijani": "azb_Arab",
105
+ "North Azerbaijani": "azj_Latn",
106
+ "Bashkir": "bak_Cyrl",
107
+ "Bambara": "bam_Latn",
108
+ "Balinese": "ban_Latn",
109
+ "Belarusian": "bel_Cyrl",
110
+ "Bemba": "bem_Latn",
111
+ "Bengali": "ben_Beng",
112
+ "Bhojpuri": "bho_Deva",
113
+ "Banjar (Arabic script)": "bjn_Arab",
114
+ "Banjar (Latin script)": "bjn_Latn",
115
+ "Standard Tibetan": "bod_Tibt",
116
+ "Bosnian": "bos_Latn",
117
+ "Buginese": "bug_Latn",
118
+ "Bulgarian": "bul_Cyrl",
119
+ "Catalan": "cat_Latn",
120
+ "Cebuano": "ceb_Latn",
121
+ "Czech": "ces_Latn",
122
+ "Chokwe": "cjk_Latn",
123
+ "Central Kurdish": "ckb_Arab",
124
+ "Crimean Tatar": "crh_Latn",
125
+ "Welsh": "cym_Latn",
126
+ "Danish": "dan_Latn",
127
+ "German": "deu_Latn",
128
+ "Southwestern Dinka": "dik_Latn",
129
+ "Dyula": "dyu_Latn",
130
+ "Dzongkha": "dzo_Tibt",
131
+ "Greek": "ell_Grek",
132
+ "English": "eng_Latn",
133
+ "Esperanto": "epo_Latn",
134
+ "Estonian": "est_Latn",
135
+ "Basque": "eus_Latn",
136
+ "Ewe": "ewe_Latn",
137
+ "Faroese": "fao_Latn",
138
+ "Fijian": "fij_Latn",
139
+ "Finnish": "fin_Latn",
140
+ "Fon": "fon_Latn",
141
+ "French": "fra_Latn",
142
+ "Friulian": "fur_Latn",
143
+ "Nigerian Fulfulde": "fuv_Latn",
144
+ "Scottish Gaelic": "gla_Latn",
145
+ "Irish": "gle_Latn",
146
+ "Galician": "glg_Latn",
147
+ "Guarani": "grn_Latn",
148
+ "Gujarati": "guj_Gujr",
149
+ "Haitian Creole": "hat_Latn",
150
+ "Hausa": "hau_Latn",
151
+ "Hebrew": "heb_Hebr",
152
+ "Hindi": "hin_Deva",
153
+ "Chhattisgarhi": "hne_Deva",
154
+ "Croatian": "hrv_Latn",
155
+ "Hungarian": "hun_Latn",
156
+ "Armenian": "hye_Armn",
157
+ "Igbo": "ibo_Latn",
158
+ "Ilocano": "ilo_Latn",
159
+ "Indonesian": "ind_Latn",
160
+ "Icelandic": "isl_Latn",
161
+ "Italian": "ita_Latn",
162
+ "Javanese": "jav_Latn",
163
+ "Japanese": "jpn_Jpan",
164
+ "Kabyle": "kab_Latn",
165
+ "Jingpho": "kac_Latn",
166
+ "Kamba": "kam_Latn",
167
+ "Kannada": "kan_Knda",
168
+ "Kashmiri (Arabic script)": "kas_Arab",
169
+ "Kashmiri (Devanagari script)": "kas_Deva",
170
+ "Georgian": "kat_Geor",
171
+ "Central Kanuri (Arabic script)": "knc_Arab",
172
+ "Central Kanuri (Latin script)": "knc_Latn",
173
+ "Kazakh": "kaz_Cyrl",
174
+ "Kabiyè": "kbp_Latn",
175
+ "Kabuverdianu": "kea_Latn",
176
+ "Khmer": "khm_Khmr",
177
+ "Kikuyu": "kik_Latn",
178
+ "Kinyarwanda": "kin_Latn",
179
+ "Kyrgyz": "kir_Cyrl",
180
+ "Kimbundu": "kmb_Latn",
181
+ "Northern Kurdish": "kmr_Latn",
182
+ "Kikongo": "kon_Latn",
183
+ "Korean": "kor_Hang",
184
+ "Lao": "lao_Laoo",
185
+ "Ligurian": "lij_Latn",
186
+ "Limburgish": "lim_Latn",
187
+ "Lingala": "lin_Latn",
188
+ "Lithuanian": "lit_Latn",
189
+ "Lombard": "lmo_Latn",
190
+ "Latgalian": "ltg_Latn",
191
+ "Luxembourgish": "ltz_Latn",
192
+ "Luba-Kasai": "lua_Latn",
193
+ "Ganda": "lug_Latn",
194
+ "Luo": "luo_Latn",
195
+ "Mizo": "lus_Latn",
196
+ "Standard Latvian": "lvs_Latn",
197
+ "Magahi": "mag_Deva",
198
+ "Maithili": "mai_Deva",
199
+ "Malayalam": "mal_Mlym",
200
+ "Marathi": "mar_Deva",
201
+ "Minangkabau (Arabic script)": "min_Arab",
202
+ "Minangkabau (Latin script)": "min_Latn",
203
+ "Macedonian": "mkd_Cyrl",
204
+ "Plateau Malagasy": "plt_Latn",
205
+ "Maltese": "mlt_Latn",
206
+ "Meitei (Bengali script)": "mni_Beng",
207
+ "Halh Mongolian": "khk_Cyrl",
208
+ "Mossi": "mos_Latn",
209
+ "Maori": "mri_Latn",
210
+ "Burmese": "mya_Mymr",
211
+ "Dutch": "nld_Latn",
212
+ "Norwegian Nynorsk": "nno_Latn",
213
+ "Norwegian Bokmål": "nob_Latn",
214
+ "Nepali": "npi_Deva",
215
+ "Northern Sotho": "nso_Latn",
216
+ "Nuer": "nus_Latn",
217
+ "Nyanja": "nya_Latn",
218
+ "Occitan": "oci_Latn",
219
+ "West Central Oromo": "gaz_Latn",
220
+ "Odia": "ory_Orya",
221
+ "Pangasinan": "pag_Latn",
222
+ "Eastern Panjabi": "pan_Guru",
223
+ "Papiamento": "pap_Latn",
224
+ "Western Persian": "pes_Arab",
225
+ "Polish": "pol_Latn",
226
+ "Portuguese": "por_Latn",
227
+ "Dari": "prs_Arab",
228
+ "Southern Pashto": "pbt_Arab",
229
+ "Ayacucho Quechua": "quy_Latn",
230
+ "Romanian": "ron_Latn",
231
+ "Rundi": "run_Latn",
232
+ "Russian": "rus_Cyrl",
233
+ "Sango": "sag_Latn",
234
+ "Sanskrit": "san_Deva",
235
+ "Santali": "sat_Olck",
236
+ "Sicilian": "scn_Latn",
237
+ "Shan": "shn_Mymr",
238
+ "Sinhala": "sin_Sinh",
239
+ "Slovak": "slk_Latn",
240
+ "Slovenian": "slv_Latn",
241
+ "Samoan": "smo_Latn",
242
+ "Shona": "sna_Latn",
243
+ "Sindhi": "snd_Arab",
244
+ "Somali": "som_Latn",
245
+ "Southern Sotho": "sot_Latn",
246
+ "Spanish": "spa_Latn",
247
+ "Tosk Albanian": "als_Latn",
248
+ "Sardinian": "srd_Latn",
249
+ "Serbian": "srp_Cyrl",
250
+ "Swati": "ssw_Latn",
251
+ "Sundanese": "sun_Latn",
252
+ "Swedish": "swe_Latn",
253
+ "Swahili": "swh_Latn",
254
+ "Silesian": "szl_Latn",
255
+ "Tamil": "tam_Taml",
256
+ "Tatar": "tat_Cyrl",
257
+ "Telugu": "tel_Telu",
258
+ "Tajik": "tgk_Cyrl",
259
+ "Tagalog": "tgl_Latn",
260
+ "Thai": "tha_Thai",
261
+ "Tigrinya": "tir_Ethi",
262
+ "Tamasheq (Latin script)": "taq_Latn",
263
+ "Tamasheq (Tifinagh script)": "taq_Tfng",
264
+ "Tok Pisin": "tpi_Latn",
265
+ "Tswana": "tsn_Latn",
266
+ "Tsonga": "tso_Latn",
267
+ "Turkmen": "tuk_Latn",
268
+ "Tumbuka": "tum_Latn",
269
+ "Turkish": "tur_Latn",
270
+ "Twi": "twi_Latn",
271
+ "Central Atlas Tamazight": "tzm_Tfng",
272
+ "Uyghur": "uig_Arab",
273
+ "Ukrainian": "ukr_Cyrl",
274
+ "Umbundu": "umb_Latn",
275
+ "Urdu": "urd_Arab",
276
+ "Northern Uzbek": "uzn_Latn",
277
+ "Venetian": "vec_Latn",
278
+ "Vietnamese": "vie_Latn",
279
+ "Waray": "war_Latn",
280
+ "Wolof": "wol_Latn",
281
+ "Xhosa": "xho_Latn",
282
+ "Eastern Yiddish": "ydd_Hebr",
283
+ "Yoruba": "yor_Latn",
284
+ "Yue Chinese": "yue_Hant",
285
+ "Chinese (Simplified)": "zho_Hans",
286
+ "Chinese (Traditional)": "zho_Hant",
287
+ "Standard Malay": "zsm_Latn",
288
+ "Zulu": "zul_Latn",
289
+ }
modules/translation/translation_base.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from datetime import datetime
7
+ import spaces
8
+
9
+ import modules.translation.nllb_inference as nllb
10
+ from modules.whisper.data_classes import *
11
+ from modules.utils.subtitle_manager import *
12
+ from modules.utils.files_manager import load_yaml, save_yaml
13
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH, NLLB_MODELS_DIR, TRANSLATION_OUTPUT_DIR
14
+
15
+
16
class TranslationBase(ABC):
    """Abstract base for text-translation backends (e.g. NLLB).

    Provides the shared plumbing: model/output directory setup, device
    selection, subtitle-file translation, CUDA cleanup and UI parameter
    caching. Subclasses implement update_model() and translate().
    """

    def __init__(self,
                 model_dir: str = NLLB_MODELS_DIR,
                 output_dir: str = TRANSLATION_OUTPUT_DIR
                 ):
        super().__init__()
        self.model = None  # backend model, loaded lazily by update_model()
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.current_model_size = None  # name of the currently loaded model, if any
        self.device = self.get_device()

    @abstractmethod
    @spaces.GPU(duration=120)
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        """Translate a single text chunk and return the translated string."""
        pass

    @abstractmethod
    @spaces.GPU(duration=120)
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """Load (or reuse) the model for the given size and language pair."""
        pass

    @spaces.GPU(duration=120)
    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       max_length: int = 200,
                       add_timestamp: bool = True,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle file from source language to target language

        Parameters
        ----------
        fileobjs: list
            List of files to transcribe from gr.Files()
        model_size: str
            Whisper model size from gr.Dropdown()
        src_lang: str
            Source language of the file to translate from gr.Dropdown()
        tgt_lang: str
            Target language of the file to translate from gr.Dropdown()
        max_length: int
            Max length per line to translate
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback

        Returns
        ----------
        A List of
            String to return to gr.Textbox()
            Files to return to gr.Files()
        """
        try:
            # gr.Files() may hand over NamedString objects; normalize to plain paths.
            if fileobjs and isinstance(fileobjs[0], gr.utils.NamedString):
                fileobjs = [file.name for file in fileobjs]

            # Persist the chosen settings so the UI can restore them next launch.
            self.cache_parameters(model_size=model_size,
                                  src_lang=src_lang,
                                  tgt_lang=tgt_lang,
                                  max_length=max_length,
                                  add_timestamp=add_timestamp)

            self.update_model(model_size=model_size,
                              src_lang=src_lang,
                              tgt_lang=tgt_lang,
                              progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                # file_ext keeps its leading dot; get_writer()/generate_file()
                # strip it when resolving the output format.
                file_name, file_ext = os.path.splitext(os.path.basename(fileobj))
                writer = get_writer(file_ext, self.output_dir)
                segments = writer.to_segments(fileobj)
                for i, segment in enumerate(segments):
                    progress(i / len(segments), desc="Translating..")
                    translated_text = self.translate(segment.text, max_length=max_length)
                    segment.text = translated_text

                subtitle, file_path = generate_file(
                    output_dir=self.output_dir,
                    output_file_name=file_name,
                    output_format=file_ext,
                    result=segments,
                    add_timestamp=add_timestamp
                )

                files_info[file_name] = {"subtitle": subtitle, "path": file_path}

            # Concatenate every file's translated subtitle for the result textbox.
            total_result = ''
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
            gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"

            output_file_paths = [item["path"] for key, item in files_info.items()]
            return [gr_str, output_file_paths]

        except Exception as e:
            print(f"Error translating file: {e}")
            raise
        finally:
            # Always free GPU memory, even when translation failed.
            self.release_cuda_memory()

    @staticmethod
    @spaces.GPU(duration=120)
    def get_device():
        """Pick the best available torch device: cuda > mps > cpu."""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    @spaces.GPU(duration=120)
    def release_cuda_memory():
        """Empty the CUDA cache and reset the peak-allocation statistics."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.reset_max_memory_allocated()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        """Delete the given input files, silently skipping missing paths."""
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)

    @staticmethod
    def cache_parameters(model_size: str,
                         src_lang: str,
                         tgt_lang: str,
                         max_length: int,
                         add_timestamp: bool):
        """Save the current NLLB translation settings to the config YAML."""
        def validate_lang(lang: str):
            # If a raw NLLB code (e.g. "eng_Latn") is passed, store its
            # human-readable name instead so the UI dropdown can restore it.
            if lang in list(nllb.NLLB_AVAILABLE_LANGS.values()):
                flipped = {value: key for key, value in nllb.NLLB_AVAILABLE_LANGS.items()}
                return flipped[lang]
            return lang

        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_params["translation"]["nllb"] = {
            "model_size": model_size,
            "source_lang": validate_lang(src_lang),
            "target_lang": validate_lang(tgt_lang),
            "max_length": max_length,
        }
        cached_params["translation"]["add_timestamp"] = add_timestamp
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/ui/__init__.py ADDED
File without changes
modules/ui/htmls.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS injected into the gradio app (donation button, markdown/tab
# spacing, and link styling for the project header).
CSS = """
.bmc-button {
padding: 2px 5px;
border-radius: 5px;
background-color: #FF813F;
color: white;
box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.3);
text-decoration: none;
display: inline-block;
font-size: 20px;
margin: 2px;
cursor: pointer;
-webkit-transition: background-color 0.3s ease;
-ms-transition: background-color 0.3s ease;
transition: background-color 0.3s ease;
}
.bmc-button:hover,
.bmc-button:active,
.bmc-button:focus {
background-color: #FF5633;
}
.markdown {
margin-bottom: 0;
padding-bottom: 0;
}
.tabs {
margin-top: 0;
padding-top: 0;
}

#md_project a {
color: black;
text-decoration: none;
}
#md_project a:hover {
text-decoration: underline;
}
"""
39
+
40
# Project header shown at the top of the UI.
# Fixed broken repository link: "Whsiper-WebUI" -> "Whisper-WebUI".
MARKDOWN = """
### [Whisper-WebUI](https://github.com/jhj0517/Whisper-WebUI)
"""
43
+
44
+
45
# Collapsible HTML table shown in the translation tab, listing approximate
# VRAM requirements for each NLLB model size.
NLLB_VRAM_TABLE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
table {
border-collapse: collapse;
width: 100%;
}
th, td {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
}
th {
background-color: #f2f2f2;
}
</style>
</head>
<body>

<details>
<summary>VRAM usage for each model</summary>
<table>
<thead>
<tr>
<th>Model name</th>
<th>Required VRAM</th>
</tr>
</thead>
<tbody>
<tr>
<td>nllb-200-3.3B</td>
<td>~16GB</td>
</tr>
<tr>
<td>nllb-200-1.3B</td>
<td>~8GB</td>
</tr>
<tr>
<td>nllb-200-distilled-600M</td>
<td>~4GB</td>
</tr>
</tbody>
</table>
<p><strong>Note:</strong> Be mindful of your VRAM! The table above provides an approximate VRAM usage for each model.</p>
</details>

</body>
</html>
"""
modules/utils/__init__.py ADDED
File without changes
modules/utils/cli_manager.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+
4
def str2bool(v):
    """Parse a truthy/falsy CLI string into a bool (argparse type helper).

    Accepts bools unchanged; raises argparse.ArgumentTypeError otherwise.
    """
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
modules/utils/constants.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from gradio_i18n import Translate, gettext as _

# i18n-translated label for the "detect language automatically" dropdown option.
AUTOMATIC_DETECTION = _("Automatic Detection")
# Sentinel values used to represent "no value" in gradio components.
GRADIO_NONE_STR = ""
GRADIO_NONE_NUMBER_MAX = 9999
GRADIO_NONE_NUMBER_MIN = 0
modules/utils/files_manager.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fnmatch
3
+ from ruamel.yaml import YAML
4
+ from gradio.utils import NamedString
5
+
6
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
7
+
8
+
9
def load_yaml(path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    """Read a YAML file with ruamel's safe loader and return its content."""
    parser = YAML(typ="safe")
    parser.preserve_quotes = True
    with open(path, 'r', encoding='utf-8') as stream:
        return parser.load(stream)
15
+
16
+
17
def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
    """Serialize `data` to YAML at `path` and return the path written."""
    dumper = YAML(typ="safe")
    # Block style, stable key order and 2/4-space indentation for readable diffs.
    dumper.map_indent = 2
    dumper.sequence_indent = 4
    dumper.sequence_dash_offset = 2
    dumper.preserve_quotes = True
    dumper.default_flow_style = False
    dumper.sort_base_mapping_type_on_output = False

    with open(path, 'w', encoding='utf-8') as stream:
        dumper.dump(data, stream)
    return path
29
+
30
+
31
def get_media_files(folder_path, include_sub_directory=False):
    """Collect paths of known video/audio files inside `folder_path`.

    When `include_sub_directory` is True the whole tree is walked recursively;
    otherwise only the folder's direct children are considered.
    """
    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
                        '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
    audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
    media_extensions = video_extensions + audio_extensions

    media_files = []

    if include_sub_directory:
        for root, _, files in os.walk(folder_path):
            for pattern in media_extensions:
                for matched in fnmatch.filter(files, pattern):
                    candidate = os.path.join(root, matched)
                    if os.path.exists(candidate):
                        media_files.append(candidate)
    else:
        entries = os.listdir(folder_path)
        for pattern in media_extensions:
            for matched in fnmatch.filter(entries, pattern):
                candidate = os.path.join(folder_path, matched)
                if os.path.isfile(candidate) and os.path.exists(candidate):
                    media_files.append(candidate)

    return media_files
54
+
55
+
56
def format_gradio_files(files: list):
    """Wrap plain path strings into gradio NamedString objects.

    Falsy input (None or empty list) is returned unchanged.
    """
    if not files:
        return files
    return [NamedString(path) for path in files]
64
+
65
+
66
def is_video(file_path):
    """Return True when the file's extension is a known video format."""
    video_extensions = ['.mp4', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg', '.3gp']
    _, extension = os.path.splitext(file_path)
    return extension.lower() in video_extensions
70
+
71
+
72
def read_file(file_path):
    """Return the whole text content of `file_path`, decoded as UTF-8."""
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()
modules/utils/paths.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# Project root: two levels up from this file (modules/utils/ -> repo root).
WEBUI_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# Model download/cache directories, one per backend.
MODELS_DIR = os.path.join(WEBUI_DIR, "models")
WHISPER_MODELS_DIR = os.path.join(MODELS_DIR, "Whisper")
FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
# Configuration files.
CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
I18N_YAML_PATH = os.path.join(CONFIGS_DIR, "translation.yaml")
# Output directories for generated artifacts.
OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")

# Create every directory at import time. makedirs() also creates missing
# parents (e.g. UVR_OUTPUT_DIR via its two children), so the list need not
# include every intermediate directory.
for dir_path in [MODELS_DIR,
                 WHISPER_MODELS_DIR,
                 FASTER_WHISPER_MODELS_DIR,
                 INSANELY_FAST_WHISPER_MODELS_DIR,
                 NLLB_MODELS_DIR,
                 DIARIZATION_MODELS_DIR,
                 UVR_MODELS_DIR,
                 CONFIGS_DIR,
                 OUTPUT_DIR,
                 TRANSLATION_OUTPUT_DIR,
                 UVR_INSTRUMENTAL_OUTPUT_DIR,
                 UVR_VOCALS_OUTPUT_DIR]:
    os.makedirs(dir_path, exist_ok=True)
modules/utils/subtitle_manager.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ported from https://github.com/openai/whisper/blob/main/whisper/utils.py
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import sys
7
+ import zlib
8
+ from typing import Callable, List, Optional, TextIO, Union, Dict, Tuple
9
+ from datetime import datetime
10
+
11
+ from modules.whisper.data_classes import Segment, Word
12
+ from .files_manager import read_file
13
+
14
+ # Zero GPU
15
+ import spaces
16
+
17
def format_timestamp(
    seconds: float, always_include_hours: bool = True, decimal_marker: str = ","
) -> str:
    """Render a non-negative offset in seconds as ``[HH:]MM:SS<marker>mmm``.

    The hour field is emitted when `always_include_hours` is True or the
    offset reaches one hour.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    total_ms = round(seconds * 1000.0)

    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1_000)

    prefix = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{prefix}{minutes:02d}:{secs:02d}{decimal_marker}{millis:03d}"
36
+
37
+
38
def time_str_to_seconds(time_str: str, decimal_marker: str = ",") -> float:
    """Inverse of format_timestamp: parse ``[HH:]MM:SS<marker>frac`` to seconds."""
    fields = time_str.split(":")

    if len(fields) == 3:
        hours_str, minutes_str, rest = fields
        hours = int(hours_str)
    else:
        minutes_str, rest = fields
        hours = 0

    whole_str, fraction_str = rest.split(decimal_marker)

    return (
        hours * 3600
        + int(minutes_str) * 60
        + int(whole_str)
        + float("0." + fraction_str)
    )
55
+
56
+
57
def get_start(segments: List[dict]) -> Optional[float]:
    """Start time of the first word across all segments.

    Falls back to the first segment's start when no words exist, and to
    None when there are no segments at all.
    """
    for segment in segments:
        for word in segment["words"]:
            return word["start"]
    return segments[0]["start"] if segments else None
62
+
63
+
64
def get_end(segments: List[dict]) -> Optional[float]:
    """End time of the last word across all segments.

    Falls back to the last segment's end when no words exist, and to
    None when there are no segments at all.
    """
    for segment in reversed(segments):
        for word in reversed(segment["words"]):
            return word["end"]
    return segments[-1]["end"] if segments else None
69
+
70
+
71
class ResultWriter:
    """Base class: serialize a transcription result into `output_dir`.

    Subclasses set `extension` and implement write_result().
    """
    extension: str

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    def __call__(
        self, result: Union[dict, List[Segment]], output_file_name: str,
        options: Optional[dict] = None, **kwargs
    ):
        # Normalize a list of Segment models into the whisper-style dict shape.
        if isinstance(result, List) and result and isinstance(result[0], Segment):
            result = {"segments": [seg.model_dump() for seg in result]}

        target = os.path.join(self.output_dir, f"{output_file_name}.{self.extension}")

        with open(target, "w", encoding="utf-8") as f:
            self.write_result(result, file=f, options=options, **kwargs)

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        raise NotImplementedError
95
+
96
+
97
class WriteTXT(ResultWriter):
    """Plain-text writer: one stripped segment text per line."""
    extension: str = "txt"

    def write_result(
        self, result: Union[Dict, List[Segment]], file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for segment in result["segments"]:
            line = segment["text"].strip()
            print(line, file=file, flush=True)
105
+
106
+
107
class SubtitlesWriter(ResultWriter):
    """Shared layout engine for timed subtitle formats (VTT/SRT/LRC).

    iterate_result() yields (start, end, text) cues, optionally re-flowing
    word-level timings into lines constrained by max_line_width,
    max_line_count and max_words_per_line.
    """
    # Subclasses configure how timestamps are rendered.
    always_include_hours: bool
    decimal_marker: str

    def iterate_result(
        self,
        result: dict,
        options: Optional[dict] = None,
        *,
        max_line_width: Optional[int] = None,
        max_line_count: Optional[int] = None,
        highlight_words: bool = False,
        align_lrc_words: bool = False,
        max_words_per_line: Optional[int] = None,
    ):
        """Yield (start, end, text) cues for `result`.

        Explicit keyword arguments take precedence over the same-named keys
        in `options`. When no word-level timings are present, one cue per
        segment is emitted.
        """
        options = options or {}
        max_line_width = max_line_width or options.get("max_line_width")
        max_line_count = max_line_count or options.get("max_line_count")
        highlight_words = highlight_words or options.get("highlight_words", False)
        align_lrc_words = align_lrc_words or options.get("align_lrc_words", False)
        max_words_per_line = max_words_per_line or options.get("max_words_per_line")
        # Keep original segment boundaries unless BOTH line limits are given.
        preserve_segments = max_line_count is None or max_line_width is None
        max_line_width = max_line_width or 1000
        max_words_per_line = max_words_per_line or 1000

        def iterate_subtitles():
            # Re-flow word timings into subtitles; breaks happen on line width,
            # line count, long pauses (>3s) or segment boundaries.
            line_len = 0
            line_count = 1
            # the next subtitle to yield (a list of word timings with whitespace)
            subtitle: List[dict] = []
            last: float = get_start(result["segments"]) or 0.0
            for segment in result["segments"]:
                chunk_index = 0
                words_count = max_words_per_line
                while chunk_index < len(segment["words"]):
                    remaining_words = len(segment["words"]) - chunk_index
                    if max_words_per_line > len(segment["words"]) - chunk_index:
                        words_count = remaining_words
                    for i, original_timing in enumerate(
                        segment["words"][chunk_index : chunk_index + words_count]
                    ):
                        # Copy so mutations below never touch the caller's data.
                        timing = original_timing.copy()
                        long_pause = (
                            not preserve_segments and timing["start"] - last > 3.0
                        )
                        has_room = line_len + len(timing["word"]) <= max_line_width
                        seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
                        if (
                            line_len > 0
                            and has_room
                            and not long_pause
                            and not seg_break
                        ):
                            # line continuation
                            line_len += len(timing["word"])
                        else:
                            # new line
                            timing["word"] = timing["word"].strip()
                            if (
                                len(subtitle) > 0
                                and max_line_count is not None
                                and (long_pause or line_count >= max_line_count)
                                or seg_break
                            ):
                                # subtitle break
                                yield subtitle
                                subtitle = []
                                line_count = 1
                            elif line_len > 0:
                                # line break
                                line_count += 1
                                timing["word"] = "\n" + timing["word"]
                            line_len = len(timing["word"].strip())
                        subtitle.append(timing)
                        last = timing["start"]
                    chunk_index += max_words_per_line
            # Flush whatever is left after the final segment.
            if len(subtitle) > 0:
                yield subtitle

        if len(result["segments"]) > 0 and "words" in result["segments"][0] and result["segments"][0]["words"]:
            # Word-level timings are available: emit re-flowed cues.
            for subtitle in iterate_subtitles():
                subtitle_start = self.format_timestamp(subtitle[0]["start"])
                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
                subtitle_text = "".join([word["word"] for word in subtitle])
                if highlight_words:
                    last = subtitle_start
                    all_words = [timing["word"] for timing in subtitle]
                    for i, this_word in enumerate(subtitle):
                        start = self.format_timestamp(this_word["start"])
                        end = self.format_timestamp(this_word["end"])
                        if last != start:
                            # Fill the gap before this word with the plain cue.
                            yield last, start, subtitle_text

                        # Underline only the current word (<u>...</u>),
                        # preserving its leading whitespace.
                        yield start, end, "".join(
                            [
                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
                                if j == i
                                else word
                                for j, word in enumerate(all_words)
                            ]
                        )
                        last = end

                if align_lrc_words:
                    # LRC word-aligned mode: a timestamp before every word and
                    # one extra after the last word.
                    lrc_aligned_words = [f"[{self.format_timestamp(sub['start'])}]{sub['word']}" for sub in subtitle]
                    l_start, l_end = self.format_timestamp(subtitle[-1]['start']), self.format_timestamp(subtitle[-1]['end'])
                    lrc_aligned_words[-1] = f"[{l_start}]{subtitle[-1]['word']}[{l_end}]"
                    lrc_aligned_words = ' '.join(lrc_aligned_words)
                    yield None, None, lrc_aligned_words

                else:
                    # NOTE(review): this `else` pairs with `if align_lrc_words`,
                    # not with `if highlight_words`, so when highlight_words is
                    # True the plain cue is yielded in addition to the
                    # highlighted ones — confirm this is intended.
                    yield subtitle_start, subtitle_end, subtitle_text
        else:
            # No word timings: one cue per segment. "-->" is sanitized so the
            # text cannot be mistaken for a "start --> end" cue line.
            for segment in result["segments"]:
                segment_start = self.format_timestamp(segment["start"])
                segment_end = self.format_timestamp(segment["end"])
                segment_text = segment["text"].strip().replace("-->", "->")
                yield segment_start, segment_end, segment_text

    def format_timestamp(self, seconds: float):
        # Render using this subtitle format's hour/decimal conventions.
        return format_timestamp(
            seconds=seconds,
            always_include_hours=self.always_include_hours,
            decimal_marker=self.decimal_marker,
        )
232
+
233
+
234
class WriteVTT(SubtitlesWriter):
    """WebVTT subtitle writer/reader."""
    extension: str = "vtt"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        print("WEBVTT\n", file=file)
        for start, end, text in self.iterate_result(result, options, **kwargs):
            print(f"{start} --> {end}\n{text}\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse a .vtt file back into a list of Segment objects."""
        segments = []

        for block in read_file(file_path).split('\n\n'):
            stripped = block.strip()
            # Skip empty blocks and the "WEBVTT" header block.
            if not stripped or stripped.startswith("WEBVTT"):
                continue
            lines = stripped.split('\n')
            start_str, end_str = lines[0].split(" --> ")
            segments.append(Segment(
                start=time_str_to_seconds(start_str, self.decimal_marker),
                end=time_str_to_seconds(end_str, self.decimal_marker),
                text=' '.join(lines[1:]),
            ))

        return segments
265
+
266
+
267
class WriteSRT(SubtitlesWriter):
    """SubRip (.srt) subtitle writer/reader."""
    extension: str = "srt"
    always_include_hours: bool = True
    decimal_marker: str = ","

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for index, (start, end, text) in enumerate(
            self.iterate_result(result, options, **kwargs), start=1
        ):
            print(f"{index}\n{start} --> {end}\n{text}\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse an .srt file back into a list of Segment objects."""
        segments = []

        for block in read_file(file_path).split('\n\n'):
            stripped = block.strip()
            if not stripped:
                continue
            lines = stripped.split('\n')
            # lines[0] is the numeric cue index; Segment does not need it.
            start_str, end_str = lines[1].split(" --> ")
            segments.append(Segment(
                start=time_str_to_seconds(start_str, self.decimal_marker),
                end=time_str_to_seconds(end_str, self.decimal_marker),
                text=' '.join(lines[2:]),
            ))

        return segments
300
+
301
+
302
class WriteLRC(SubtitlesWriter):
    """LRC lyrics-style writer/reader.

    Lines are written either as ``[start]text[end]`` or, with
    ``align_lrc_words``, as per-word ``[timestamp]word`` sequences.
    """
    extension: str = "lrc"
    always_include_hours: bool = False
    decimal_marker: str = "."

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        for i, (start, end, text) in enumerate(
            self.iterate_result(result, options, **kwargs), start=1
        ):
            if "align_lrc_words" in kwargs and kwargs["align_lrc_words"]:
                # Word-aligned mode: iterate_result already embedded timestamps.
                print(f"{text}\n", file=file, flush=True)
            else:
                print(f"[{start}]{text}[{end}]\n", file=file, flush=True)

    def to_segments(self, file_path: str) -> List[Segment]:
        """Parse an .lrc file back into a list of Segment objects.

        Handles both line shapes produced by write_result: a single
        ``[start]text[end]`` triple, and word-aligned ``[t1]w1 [t2]w2 ... [tn]``
        runs, where each word's end time is the following timestamp.
        """
        segments = []

        for line in read_file(file_path).split('\n'):
            if line.strip() == '':
                continue
            # Split into alternating "[timestamp]" and text chunks; keep the
            # bracketed delimiters (capturing group) and drop empty pieces.
            parts = [part.strip() for part in re.split(r'(\[.*?\])', line.strip()) if part]

            # Text chunks sit at odd indices, bracketed by timestamps.
            # Bug fix: the previous code indexed parts[0..2] for every chunk
            # (via i % 2), duplicating the first word's times and text across
            # the whole line in word-aligned files. Index relative to i instead.
            for i in range(1, len(parts) - 1, 2):
                start_str = parts[i - 1].replace("[", "").replace("]", "")
                end_str = parts[i + 1].replace("[", "").replace("]", "")
                segments.append(Segment(
                    start=time_str_to_seconds(start_str, self.decimal_marker),
                    end=time_str_to_seconds(end_str, self.decimal_marker),
                    text=parts[i],
                ))

        return segments
344
+
345
+
346
class WriteTSV(ResultWriter):
    """
    Write a transcript to a file in TSV (tab-separated values) format containing lines like:
    <start time in integer milliseconds>\t<end time in integer milliseconds>\t<transcript text>

    Using integer milliseconds as start and end times means there's no chance of interference from
    an environment setting a language encoding that causes the decimal in a floating point number
    to appear as a comma; also is faster and more efficient to parse & store, e.g., in C++.
    """

    extension: str = "tsv"

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        print("start", "end", "text", sep="\t", file=file)
        for segment in result["segments"]:
            start_ms = round(1000 * segment["start"])
            end_ms = round(1000 * segment["end"])
            text = segment["text"].strip().replace("\t", " ")
            print(start_ms, end_ms, text, sep="\t", file=file, flush=True)
366
+
367
+
368
class WriteJSON(ResultWriter):
    """Dump the raw result dict as a single JSON document."""
    extension: str = "json"

    def write_result(
        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
    ):
        # Extra layout kwargs are irrelevant for JSON and are ignored.
        json.dump(result, file)
375
+
376
+
377
def get_writer(
    output_format: str, output_dir: str
) -> Callable[[dict, TextIO, dict], None]:
    """Return a writer for `output_format`, or an aggregate callable for "all".

    The format string is normalized (trimmed, lowercased, dots removed), so
    both ".srt" and "SRT" resolve to the SubRip writer.
    """
    fmt = output_format.strip().lower().replace(".", "")

    writers = {
        "txt": WriteTXT,
        "vtt": WriteVTT,
        "srt": WriteSRT,
        "tsv": WriteTSV,
        "json": WriteJSON,
        "lrc": WriteLRC
    }

    if fmt == "all":
        all_writers = [writer(output_dir) for writer in writers.values()]

        def write_all(
            result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
        ):
            # Emit the same result in every supported format.
            for writer in all_writers:
                writer(result, file, options, **kwargs)

        return write_all

    return writers[fmt](output_dir)
403
+
404
+
405
def generate_file(
    output_format: str, output_dir: str, result: Union[dict, List[Segment]], output_file_name: str,
    add_timestamp: bool = True, **kwargs
) -> Tuple[str, str]:
    """Write `result` as a subtitle/transcript file and return (content, path).

    Optionally appends a MMDDHHMMSS timestamp to the file name so repeated
    runs do not overwrite each other.
    """
    fmt = output_format.strip().lower().replace(".", "")
    if fmt == "webvtt":
        fmt = "vtt"

    if add_timestamp:
        output_file_name += f"-{datetime.now().strftime('%m%d%H%M%S')}"

    file_path = os.path.join(output_dir, f"{output_file_name}.{fmt}")
    file_writer = get_writer(output_format=fmt, output_dir=output_dir)

    # LRC has no word-highlight concept; map the request onto its
    # word-aligned timestamp mode instead.
    if isinstance(file_writer, WriteLRC) and kwargs.get("highlight_words", False):
        kwargs["highlight_words"], kwargs["align_lrc_words"] = False, True

    file_writer(result=result, output_file_name=output_file_name, **kwargs)
    return read_file(file_path), file_path
425
+
426
# NOTE(review): @spaces.GPU on a pure string helper looks like a copy-paste
# from the GPU-bound functions — this reserves ZeroGPU time for no reason;
# confirm whether it can be dropped.
@spaces.GPU(duration=120)
def safe_filename(name):
    """Sanitize `name` for use as a filename.

    Replaces characters invalid on common filesystems with "_" and truncates
    the result to at most 20 characters, preserving the extension when it fits.
    """
    INVALID_FILENAME_CHARS = r'[<>:"/\\|?*\x00-\x1f]'
    safe_name = re.sub(INVALID_FILENAME_CHARS, '_', name)
    # Truncate the filename if it exceeds the max_length (20)
    if len(safe_name) > 20:
        # NOTE(review): for names without a dot, split('.')[-1] is the whole
        # name, so the plain 20-char truncation branch below applies.
        file_extension = safe_name.split('.')[-1]
        if len(file_extension) + 1 < 20:
            truncated_name = safe_name[:20 - len(file_extension) - 1]
            safe_name = truncated_name + '.' + file_extension
        else:
            safe_name = safe_name[:20]
    return safe_name
modules/utils/youtube_manager.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pytubefix import YouTube
2
+ import subprocess
3
+ import os
4
+
5
+
6
def get_ytdata(link):
    """Create a pytubefix YouTube object for the given video URL."""
    return YouTube(link)
8
+
9
+
10
def get_ytmetas(link):
    """Return (thumbnail_url, title, description) for a YouTube video URL."""
    video = YouTube(link)
    return video.thumbnail_url, video.title, video.description
13
+
14
+
15
def get_ytaudio(ytdata: YouTube):
    """Download the audio-only stream of a YouTube video and return its path.

    The raw download is re-encoded through ffmpeg because the stream can be
    corrupted (see https://github.com/jhj0517/Whisper-WebUI/issues/304);
    returns None when the ffmpeg conversion fails.
    """
    # Somehow the audio is corrupted so need to convert to valid audio file.
    # Fix for : https://github.com/jhj0517/Whisper-WebUI/issues/304

    audio_path = ytdata.streams.get_audio_only().download(filename=os.path.join("modules", "yt_tmp.wav"))
    temp_audio_path = os.path.join("modules", "yt_tmp_fixed.wav")

    try:
        # Re-encode into a fresh wav, then atomically swap it over the original.
        subprocess.run([
            'ffmpeg', '-y',
            '-i', audio_path,
            temp_audio_path
        ], check=True)

        os.replace(temp_audio_path, audio_path)
        return audio_path
    except subprocess.CalledProcessError as e:
        print(f"Error during ffmpeg conversion: {e}")
        return None
modules/uvr/music_separator.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union, List, Dict
2
+ import numpy as np
3
+ import torchaudio
4
+ import soundfile as sf
5
+ import os
6
+ import torch
7
+ import gc
8
+ import gradio as gr
9
+ from datetime import datetime
10
+
11
+ from uvr.models import MDX, Demucs, VrNetwork, MDXC
12
+ from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
13
+ from modules.utils.files_manager import load_yaml, save_yaml, is_video
14
+ from modules.diarize.audio_loader import load_audio
15
+
16
class MusicSeparator:
    """Separate vocals and instrumental (BGM) tracks from audio using UVR MDX models.

    Separated tracks are written under `<output_dir>/instrumental` and
    `<output_dir>/vocals` when saving is requested.
    """

    def __init__(self,
                 model_dir: Optional[str] = None,
                 output_dir: Optional[str] = None):
        self.model = None
        self.device = self.get_device()
        self.available_devices = ["cpu", "cuda"]
        self.model_dir = model_dir
        self.output_dir = output_dir
        # NOTE(review): output_dir=None would crash os.path.join below — callers
        # appear to always pass a real path; confirm before relying on the default.
        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
        vocals_output_dir = os.path.join(self.output_dir, "vocals")
        os.makedirs(instrumental_output_dir, exist_ok=True)
        os.makedirs(vocals_output_dir, exist_ok=True)
        self.audio_info = None
        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
        self.default_model = self.available_models[0]
        self.current_model_size = self.default_model
        self.model_config = {
            "segment": 256,
            "split": True
        }

    def update_model(self,
                     model_name: str = "UVR-MDX-NET-Inst_1",
                     device: Optional[str] = None,
                     segment_size: int = 256):
        """
        Update model with the given model name

        Args:
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
        """
        if device is None:
            device = self.device

        self.device = device
        self.model_config = {
            "segment": segment_size,
            "split": True
        }
        self.model = MDX(name=model_name,
                         other_metadata=self.model_config,
                         device=self.device,
                         logger=None,
                         model_dir=self.model_dir)
        # FIX: record which model is loaded so `separate()` can skip redundant
        # re-initialization; previously this was never updated and the staleness
        # check below always saw the default model name.
        self.current_model_size = model_name

    def separate(self,
                 audio: Union[str, np.ndarray],
                 model_name: str,
                 device: Optional[str] = None,
                 segment_size: int = 256,
                 save_file: bool = False,
                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
        """
        Separate the background music from the audio.

        Args:
            audio (Union[str, np.ndarray]): Audio path or numpy array.
            model_name (str): Model name.
            device (str): Device to use for the model.
            segment_size (int): Segment size for the prediction.
            save_file (bool): Whether to save the separated audio to output path or not.
            progress (gr.Progress): Gradio progress indicator.

        Returns:
            A Tuple of
            np.ndarray: Instrumental numpy arrays.
            np.ndarray: Vocals numpy arrays.
            file_paths: List of file paths where the separated audio is saved. Return empty when save_file is False.
        """
        if device is None:
            # FIX: resolve the default device before the staleness check below;
            # comparing `self.device != None` previously forced a model reload
            # on every call with the default argument.
            device = self.device

        if isinstance(audio, str):
            output_filename, ext = os.path.basename(audio), ".wav"
            output_filename, _ = os.path.splitext(output_filename)

            if is_video(audio):
                audio = load_audio(audio)
                sample_rate = 16000
            else:
                self.audio_info = torchaudio.info(audio)
                sample_rate = self.audio_info.sample_rate
        else:
            # Raw array input: name the outputs with a timestamp instead.
            timestamp = datetime.now().strftime("%m%d%H%M%S")
            output_filename, ext = f"UVR-{timestamp}", ".wav"
            sample_rate = 16000

        model_config = {
            "segment": segment_size,
            "split": True
        }

        # Reload the model only when something that affects inference changed.
        if (self.model is None or
                self.current_model_size != model_name or
                self.model_config != model_config or
                self.model.sample_rate != sample_rate or
                self.device != device):
            progress(0, desc="Initializing UVR Model..")
            self.update_model(
                model_name=model_name,
                device=device,
                segment_size=segment_size
            )
            self.model.sample_rate = sample_rate

        progress(0, desc="Separating background music from the audio..")
        result = self.model(audio)
        instrumental, vocals = result["instrumental"].T, result["vocals"].T

        file_paths = []
        if save_file:
            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
            file_paths += [instrumental_output_path, vocals_output_path]

        return instrumental, vocals, file_paths

    def separate_files(self,
                       files: List,
                       model_name: str,
                       device: Optional[str] = None,
                       segment_size: int = 256,
                       save_file: bool = True,
                       progress: gr.Progress = gr.Progress()) -> List[str]:
        """Separate the background music from the audio files. Returns only last Instrumental and vocals file paths
        to display into gr.Audio()"""
        self.cache_parameters(model_size=model_name, segment_size=segment_size)

        # FIX: initialize so an empty `files` list returns [] instead of
        # raising NameError on the final `return`.
        file_paths = []
        for file_path in files:
            instrumental, vocals, file_paths = self.separate(
                audio=file_path,
                model_name=model_name,
                device=device,
                segment_size=segment_size,
                save_file=save_file,
                progress=progress
            )
        return file_paths

    @staticmethod
    def get_device():
        """Get device for the model"""
        return "cuda" if torch.cuda.is_available() else "cpu"

    def offload(self):
        """Offload the model and free up the memory"""
        if self.model is not None:
            del self.model
            self.model = None
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
        self.audio_info = None

    @staticmethod
    def cache_parameters(model_size: str,
                         segment_size: int):
        """Persist the last-used BGM-separation parameters to the default config YAML."""
        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
        cached_uvr_params = cached_params["bgm_separation"]
        uvr_params_to_cache = {
            "model_size": model_size,
            "segment_size": segment_size
        }
        cached_uvr_params = {**cached_uvr_params, **uvr_params_to_cache}
        cached_params["bgm_separation"] = cached_uvr_params
        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)
modules/vad/__init__.py ADDED
File without changes
modules/vad/silero_vad.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
2
+
3
+ from faster_whisper.vad import VadOptions, get_vad_model
4
+ import numpy as np
5
+ from typing import BinaryIO, Union, List, Optional, Tuple
6
+ import warnings
7
+ import faster_whisper
8
+ from modules.whisper.data_classes import *
9
+ from faster_whisper.transcribe import SpeechTimestampsMap
10
+ import gradio as gr
11
+
12
+
13
class SileroVAD:
    """Silero voice-activity-detection wrapper (adapted from faster-whisper).

    Removes non-speech regions from audio before transcription and can map
    the resulting segment timestamps back onto the original timeline.
    """

    def __init__(self):
        # Silero VAD operates on 16 kHz mono audio, scored in 512-sample windows.
        self.sampling_rate = 16000
        self.window_size_samples = 512
        self.model = None  # loaded lazily by update_model()

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """

        sampling_rate = self.sampling_rate

        # Decode file paths / binaries to a float array at 16 kHz.
        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=sampling_rate)

        # NOTE(review): `duration` / `duration_after_vad` are computed but never
        # returned or logged by this method.
        duration = audio.shape[0] / sampling_rate
        duration_after_vad = duration

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)
        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        # Keep only the detected speech regions, concatenated back to back.
        audio = self.collect_chunks(audio, speech_chunks)
        duration_after_vad = audio.shape[0] / sampling_rate

        return audio, speech_chunks

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: Optional[VadOptions] = None,
        progress: gr.Progress = gr.Progress(),
        **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
            audio: One dimensional float array.
            vad_options: Options for VAD processing.
            kwargs: VAD options passed as keyword arguments for backward compatibility.
            progress: Gradio progress to indicate progress.

        Returns:
            List of dicts containing begin and end samples of each speech chunk.
        """

        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        sampling_rate = 16000
        # Convert every ms/s option into sample counts once, up front.
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
            sampling_rate * max_speech_duration_s
            - window_size_samples
            - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        # Pass 1: score every fixed-size window with the Silero model.
        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            if len(chunk) < window_size_samples:
                # Zero-pad the final partial window to the model's input size.
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        # Pass 2: hysteresis state machine over the per-window probabilities.
        triggered = False
        speeches = []
        current_speech = {}
        # Exit threshold is lower than the entry threshold (hysteresis).
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                # Speech resumed before the silence was long enough to cut.
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                # Rising edge: open a new speech segment.
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                triggered
                and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                # Segment exceeded max length: force a split, preferring the
                # last observed silence (prev_end) as the cut point.
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    # No silence seen: hard cut at the current window.
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                # Falling edge candidate: remember where silence began.
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    # Silence long enough: close the segment (if it is long
                    # enough to count as speech) and reset the state machine.
                    current_speech["end"] = temp_end
                    if (
                        current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        # Flush a trailing open segment that reaches the end of the audio.
        if (
            current_speech
            and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        # Pass 3: pad each segment, sharing the available silence between
        # neighbors when they are closer than twice the padding.
        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        # Fetch faster-whisper's bundled Silero ONNX model.
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        """Render seconds as [HH:]MM:SS<marker>mmm."""
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
        self,
        segments: List[Segment],
        speech_chunks: List[dict],
        sampling_rate: Optional[int] = None,
    ) -> List[Segment]:
        """Map segment times from VAD-concatenated audio back to the original timeline (mutates segments in place)."""
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment.start = ts_map.get_original_time(segment.start)
            segment.end = ts_map.get_original_time(segment.end)

        return segments
265
+
modules/whisper/__init__.py ADDED
File without changes
modules/whisper/base_transcription_pipeline.py ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import whisper
3
+ import ctranslate2
4
+ import gradio as gr
5
+ import torchaudio
6
+ from abc import ABC, abstractmethod
7
+ from typing import BinaryIO, Union, Tuple, List
8
+ import numpy as np
9
+ from datetime import datetime
10
+ from faster_whisper.vad import VadOptions
11
+
12
+ from modules.uvr.music_separator import MusicSeparator
13
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
14
+ UVR_MODELS_DIR)
15
+ from modules.utils.constants import *
16
+ from modules.utils.subtitle_manager import *
17
+ from modules.utils.youtube_manager import get_ytdata, get_ytaudio
18
+ from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml, read_file
19
+ from modules.whisper.data_classes import *
20
+ from modules.diarize.diarizer import Diarizer
21
+ from modules.vad.silero_vad import SileroVAD
22
+
23
+
24
+ class BaseTranscriptionPipeline(ABC):
25
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        """Set up shared pipeline state: output/model directories, the
        diarizer, VAD, BGM separator, and device/compute-type defaults.
        Subclasses provide the actual whisper backend via `update_model`."""
        self.model_dir = model_dir
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)
        self.diarizer = Diarizer(
            model_dir=diarization_model_dir
        )
        self.vad = SileroVAD()
        self.music_separator = MusicSeparator(
            model_dir=uvr_model_dir,
            output_dir=os.path.join(output_dir, "UVR")
        )

        # Backend model is loaded lazily by the subclass's update_model().
        self.model = None
        self.current_model_size = None
        self.available_models = whisper.available_models()
        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
        # Only these sizes support whisper's translate task.
        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
        self.device = self.get_device()
        self.available_compute_types = self.get_available_compute_type()
        self.current_compute_type = self.get_compute_type()
52
+
53
    @abstractmethod
    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ):
        """Inference whisper model to transcribe.

        Implemented by each backend subclass; must return
        (segments, elapsed_time) as consumed by `run()`."""
        pass
61
+
62
    @abstractmethod
    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress()
                     ):
        """Initialize whisper model.

        Implemented by each backend subclass; loads (or reloads) the model
        for the given size and compute type."""
        pass
70
+
71
    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            progress: gr.Progress = gr.Progress(),
            file_format: str = "SRT",
            add_timestamp: bool = True,
            *pipeline_params,
            ) -> Tuple[List[Segment], float]:
        """
        Run transcription with conditional pre-processing and post-processing.
        The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
        The diarization will be performed in post-processing, if enabled.
        Due to the integration with gradio, the parameters have to be specified with a `*` wildcard.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio input. This can be file path or binary type.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        file_format: str
            Subtitle file format between ["SRT", "WebVTT", "txt", "lrc"]
        add_timestamp: bool
            Whether to add a timestamp at the end of the filename.
        *pipeline_params: tuple
            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class.
            This must be provided as a List with * wildcard because of the integration with gradio.
            See more info at : https://github.com/gradio-app/gradio/issues/2471

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for running
        """
        params = TranscriptionPipelineParams.from_list(list(pipeline_params))
        params = self.validate_gradio_values(params)
        bgm_params, vad_params, whisper_params, diarization_params = params.bgm_separation, params.vad, params.whisper, params.diarization

        # Pre-processing stage 1: optionally strip background music (UVR),
        # keeping only the vocals track for transcription.
        if bgm_params.is_separate_bgm:
            music, audio, _ = self.music_separator.separate(
                audio=audio,
                model_name=bgm_params.model_size,
                device=bgm_params.device,
                segment_size=bgm_params.segment_size,
                save_file=bgm_params.save_file,
                progress=progress
            )

            # Downmix multi-channel vocals to mono before resampling.
            if audio.ndim >= 2:
                audio = audio.mean(axis=1)
            if self.music_separator.audio_info is None:
                origin_sample_rate = 16000
            else:
                origin_sample_rate = self.music_separator.audio_info.sample_rate
            audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)

            if bgm_params.enable_offload:
                self.music_separator.offload()

        # Pre-processing stage 2: optionally drop non-speech regions with VAD.
        if vad_params.vad_filter:
            vad_options = VadOptions(
                threshold=vad_params.threshold,
                min_speech_duration_ms=vad_params.min_speech_duration_ms,
                max_speech_duration_s=vad_params.max_speech_duration_s,
                min_silence_duration_ms=vad_params.min_silence_duration_ms,
                speech_pad_ms=vad_params.speech_pad_ms
            )

            vad_processed, speech_chunks = self.vad.run(
                audio=audio,
                vad_parameters=vad_options,
                progress=progress
            )

            if vad_processed.size > 0:
                audio = vad_processed
            else:
                # VAD found no speech at all: fall back to the raw audio and
                # disable the later timestamp restoration.
                vad_params.vad_filter = False

        result, elapsed_time = self.transcribe(
            audio,
            progress,
            *whisper_params.to_list()
        )

        # Post-processing: map timestamps back to the original timeline if
        # the audio was VAD-trimmed before transcription.
        if vad_params.vad_filter:
            result = self.vad.restore_speech_timestamps(
                segments=result,
                speech_chunks=speech_chunks,
            )

        if diarization_params.is_diarize:
            result, elapsed_time_diarization = self.diarizer.run(
                audio=audio,
                use_auth_token=diarization_params.hf_token,
                transcribed_result=result,
                device=diarization_params.device
            )
            elapsed_time += elapsed_time_diarization

        # Persist the last-used parameters so the UI restores them next launch.
        self.cache_parameters(
            params=params,
            file_format=file_format,
            add_timestamp=add_timestamp
        )
        return result, elapsed_time
178
+
179
    def transcribe_file(self,
                        files: Optional[List] = None,
                        input_folder_path: Optional[str] = None,
                        file_format: str = "SRT",
                        add_timestamp: bool = True,
                        progress=gr.Progress(),
                        *pipeline_params,
                        ) -> Tuple[str, List]:
        """
        Write subtitle file from Files

        Parameters
        ----------
        files: list
            List of files to transcribe from gr.Files()
        input_folder_path: str
            Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
            this will be used instead.
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters for the transcription pipeline. This will be dealt with "TranscriptionPipelineParams" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            # Normalize the various gradio input shapes to a list of paths.
            if input_folder_path:
                files = get_media_files(input_folder_path)
            if isinstance(files, str):
                files = [files]
            if files and isinstance(files[0], gr.utils.NamedString):
                files = [file.name for file in files]

            # NOTE(review): keyed by basename without extension — two inputs
            # with the same stem would overwrite each other here.
            files_info = {}
            for file in files:
                transcribed_segments, time_for_task = self.run(
                    file,
                    progress,
                    file_format,
                    add_timestamp,
                    *pipeline_params,
                )

                file_name, file_ext = os.path.splitext(os.path.basename(file))
                subtitle, file_path = generate_file(
                    output_dir=self.output_dir,
                    output_file_name=file_name,
                    output_format=file_format,
                    result=transcribed_segments,
                    add_timestamp=add_timestamp,
                    **writer_options
                )
                files_info[file_name] = {"subtitle": read_file(file_path), "time_for_task": time_for_task, "path": file_path}

            # Concatenate all per-file subtitles into one display string.
            total_result = ''
            total_time = 0
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'
                total_time += info["time_for_task"]

            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
            result_file_path = [info['path'] for info in files_info.values()]

            return result_str, result_file_path

        except Exception as e:
            print(f"Error transcribing file: {e}")
            raise
        finally:
            self.release_cuda_memory()
265
+
266
    def transcribe_mic(self,
                       mic_audio: str,
                       file_format: str = "SRT",
                       add_timestamp: bool = True,
                       progress=gr.Progress(),
                       *pipeline_params,
                       ) -> Tuple[str, str]:
        """
        Write subtitle file from microphone

        Parameters
        ----------
        mic_audio: str
            Audio file path from gr.Microphone()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            progress(0, desc="Loading Audio..")
            transcribed_segments, time_for_task = self.run(
                mic_audio,
                progress,
                file_format,
                add_timestamp,
                *pipeline_params,
            )
            progress(1, desc="Completed!")

            # Mic recordings have no meaningful filename; use a fixed stem.
            file_name = "Mic"
            subtitle, file_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_format,
                result=transcribed_segments,
                add_timestamp=add_timestamp,
                **writer_options
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
            return result_str, file_path
        except Exception as e:
            print(f"Error transcribing mic: {e}")
            raise
        finally:
            self.release_cuda_memory()
329
+
330
    def transcribe_youtube(self,
                           youtube_link: str,
                           file_format: str = "SRT",
                           add_timestamp: bool = True,
                           progress=gr.Progress(),
                           *pipeline_params,
                           ) -> Tuple[str, str]:
        """
        Write subtitle file from Youtube

        Parameters
        ----------
        youtube_link: str
            URL of the Youtube video to transcribe from gr.Textbox()
        file_format: str
            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *pipeline_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        result_str:
            Result of transcription to return to gr.Textbox()
        result_file_path:
            Output file path to return to gr.Files()
        """
        try:
            params = TranscriptionPipelineParams.from_list(list(pipeline_params))
            writer_options = {
                "highlight_words": True if params.whisper.word_timestamps else False
            }

            progress(0, desc="Loading Audio from Youtube..")
            # Downloads the audio stream to a temporary local file.
            yt = get_ytdata(youtube_link)
            audio = get_ytaudio(yt)

            transcribed_segments, time_for_task = self.run(
                audio,
                progress,
                file_format,
                add_timestamp,
                *pipeline_params,
            )

            progress(1, desc="Completed!")

            # Derive a filesystem-safe name from the video title.
            file_name = safe_filename(yt.title)
            subtitle, file_path = generate_file(
                output_dir=self.output_dir,
                output_file_name=file_name,
                output_format=file_format,
                result=transcribed_segments,
                add_timestamp=add_timestamp,
                **writer_options
            )

            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"

            # Clean up the downloaded temporary audio on success.
            if os.path.exists(audio):
                os.remove(audio)

            return result_str, file_path

        except Exception as e:
            print(f"Error transcribing youtube: {e}")
            raise
        finally:
            self.release_cuda_memory()
402
+
403
+ def get_compute_type(self):
404
+ if "float16" in self.available_compute_types:
405
+ return "float16"
406
+ if "float32" in self.available_compute_types:
407
+ return "float32"
408
+ else:
409
+ return self.available_compute_types[0]
410
+
411
+ def get_available_compute_type(self):
412
+ if self.device == "cuda":
413
+ return list(ctranslate2.get_supported_compute_types("cuda"))
414
+ else:
415
+ return list(ctranslate2.get_supported_compute_types("cpu"))
416
+
417
@staticmethod
def format_time(elapsed_time: float) -> str:
    """
    Get {hours} {minutes} {seconds} time format string

    Parameters
    ----------
    elapsed_time: float
        Elapsed time for transcription, in seconds

    Returns
    ----------
    Time format string such as "1 hours 2 minutes 3 seconds".
    Zero-valued hour/minute parts are omitted.
    """
    hours, rem = divmod(elapsed_time, 3600)
    minutes, seconds = divmod(rem, 60)
    # divmod on a float elapsed_time yields float quotients, which would
    # render as "1.0 hours"; cast to int for clean display.
    hours, minutes = int(hours), int(minutes)

    time_str = ""
    if hours:
        time_str += f"{hours} hours "
    if minutes:
        time_str += f"{minutes} minutes "
    seconds = round(seconds)
    time_str += f"{seconds} seconds"

    return time_str.strip()
443
+
444
@staticmethod
def get_device():
    """Select the best available torch device: "cuda" > "mps" > "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        # Device `SparseMPS` is not supported for now.
        # See : https://github.com/pytorch/pytorch/issues/87886
        return "mps" if BaseTranscriptionPipeline.is_sparse_api_supported() else "cpu"
    return "cpu"
455
+
456
@staticmethod
def is_sparse_api_supported():
    """Return True if torch can build a sparse COO tensor on the MPS backend."""
    if not torch.backends.mps.is_available():
        return False

    try:
        # Probe the sparse API; unsupported builds raise RuntimeError.
        torch.sparse_coo_tensor(
            indices=torch.tensor([[0, 1], [2, 3]]),
            values=torch.tensor([1, 2]),
            size=(4, 4),
            device=torch.device("mps"),
        )
        return True
    except RuntimeError:
        return False
472
+
473
@staticmethod
def release_cuda_memory():
    """Release memory"""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
479
+
480
@staticmethod
def remove_input_files(file_paths: List[str]):
    """Remove gradio cached files"""
    # None / empty list means nothing to clean up.
    for path in (file_paths or []):
        if path and os.path.exists(path):
            os.remove(path)
489
+
490
@staticmethod
def validate_gradio_values(params: TranscriptionPipelineParams):
    """
    Validate gradio specific values that can't be displayed as None in the UI.
    Related issue : https://github.com/gradio-app/gradio/issues/8723
    """
    wp = params.whisper

    if wp.lang == AUTOMATIC_DETECTION:
        wp.lang = None
    elif wp.lang is not None:
        # Map the display name (e.g. "english") back to its whisper code ("en").
        name_to_code = {name: code for code, name in whisper.tokenizer.LANGUAGES.items()}
        wp.lang = name_to_code[wp.lang]

    # Text fields: the UI shows a sentinel string instead of None.
    for attr in ("initial_prompt", "prefix", "hotwords"):
        if getattr(wp, attr) == GRADIO_NONE_STR:
            setattr(wp, attr, None)

    # Numeric fields: the UI sentinel minimum means "unset".
    for attr in ("max_new_tokens", "hallucination_silence_threshold",
                 "language_detection_threshold"):
        if getattr(wp, attr) == GRADIO_NONE_NUMBER_MIN:
            setattr(wp, attr, None)

    # The UI sentinel maximum stands in for an unbounded duration.
    if params.vad.max_speech_duration_s == GRADIO_NONE_NUMBER_MAX:
        params.vad.max_speech_duration_s = float('inf')
    return params
519
+
520
@staticmethod
def cache_parameters(
    params: TranscriptionPipelineParams,
    file_format: str = "SRT",
    add_timestamp: bool = True
):
    """Cache parameters to the yaml file.

    Merges the current pipeline parameters over the previously cached values
    and converts UI-only sentinel values back into yaml-serializable form
    before writing DEFAULT_PARAMETERS_CONFIG_PATH.

    Parameters
    ----------
    params: TranscriptionPipelineParams
        Parameters to persist.
    file_format: str
        Selected subtitle output format (e.g. "SRT"), stored alongside.
    add_timestamp: bool
        Whether timestamps are appended to output file names, stored alongside.
    """
    cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
    param_to_cache = params.to_dict()

    # Newly supplied values take precedence over previously cached ones.
    cached_yaml = {**cached_params, **param_to_cache}
    cached_yaml["whisper"]["add_timestamp"] = add_timestamp
    cached_yaml["whisper"]["file_format"] = file_format

    # suppress_tokens is stored as its string repr (e.g. "[-1]") so the
    # yaml stays human-editable; the field validator parses it back.
    supress_token = cached_yaml["whisper"].get("suppress_tokens", None)
    if supress_token and isinstance(supress_token, list):
        cached_yaml["whisper"]["suppress_tokens"] = str(supress_token)

    # lang is cached as its display name; None means automatic detection.
    if cached_yaml["whisper"].get("lang", None) is None:
        cached_yaml["whisper"]["lang"] = AUTOMATIC_DETECTION.unwrap()
    else:
        language_dict = whisper.tokenizer.LANGUAGES
        cached_yaml["whisper"]["lang"] = language_dict[cached_yaml["whisper"]["lang"]]

    # float('inf') is not yaml-friendly; store the UI sentinel instead.
    if cached_yaml["vad"].get("max_speech_duration_s", float('inf')) == float('inf'):
        cached_yaml["vad"]["max_speech_duration_s"] = GRADIO_NONE_NUMBER_MAX

    if cached_yaml is not None and cached_yaml:
        save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
549
+
550
@staticmethod
def resample_audio(audio: Union[str, np.ndarray],
                   new_sample_rate: int = 16000,
                   original_sample_rate: Optional[int] = None,) -> np.ndarray:
    """Resamples audio to 16k sample rate, standard on Whisper model"""
    if isinstance(audio, str):
        # File path: torchaudio reports the source sample rate itself.
        waveform, original_sample_rate = torchaudio.load(audio)
    elif original_sample_rate is None:
        raise ValueError("original_sample_rate must be provided when audio is numpy array.")
    else:
        waveform = torch.from_numpy(audio)

    resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate,
                                               new_freq=new_sample_rate)
    return resampler(waveform).numpy()
modules/whisper/data_classes.py ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faster_whisper.transcribe
2
+ import gradio as gr
3
+ import torch
4
+ from typing import Optional, Dict, List, Union, NamedTuple
5
+ from pydantic import BaseModel, Field, field_validator, ConfigDict
6
+ from gradio_i18n import Translate, gettext as _
7
+ from enum import Enum
8
+ from copy import deepcopy
9
+
10
+ import yaml
11
+
12
+ from modules.utils.constants import *
13
+
14
+
15
class WhisperImpl(Enum):
    """Whisper backend implementations selectable by the app (UI/CLI)."""
    WHISPER = "whisper"
    FASTER_WHISPER = "faster-whisper"
    INSANELY_FAST_WHISPER = "insanely_fast_whisper"
19
+
20
+
21
class Segment(BaseModel):
    """One transcribed audio segment with timing, decoding stats and words.

    All fields default to None so partially-populated segments (e.g. from
    backends that don't report decoding statistics) are still valid.
    """
    id: Optional[int] = Field(default=None, description="Incremental id for the segment")
    seek: Optional[int] = Field(default=None, description="Seek of the segment from chunked audio")
    text: Optional[str] = Field(default=None, description="Transcription text of the segment")
    start: Optional[float] = Field(default=None, description="Start time of the segment")
    end: Optional[float] = Field(default=None, description="End time of the segment")
    tokens: Optional[List[int]] = Field(default=None, description="List of token IDs")
    temperature: Optional[float] = Field(default=None, description="Temperature used during the decoding process")
    avg_logprob: Optional[float] = Field(default=None, description="Average log probability of the tokens")
    compression_ratio: Optional[float] = Field(default=None, description="Compression ratio of the segment")
    no_speech_prob: Optional[float] = Field(default=None, description="Probability that it's not speech")
    words: Optional[List['Word']] = Field(default=None, description="List of words contained in the segment")

    @classmethod
    def from_faster_whisper(cls,
                            seg: faster_whisper.transcribe.Segment):
        """Convert a faster-whisper Segment (and its words, if any) to this model."""
        # Word-level entries only exist when word timestamps were requested.
        if seg.words is not None:
            words = [
                Word(
                    start=w.start,
                    end=w.end,
                    word=w.word,
                    probability=w.probability
                ) for w in seg.words
            ]
        else:
            words = None

        return cls(
            id=seg.id,
            seek=seg.seek,
            text=seg.text,
            start=seg.start,
            end=seg.end,
            tokens=seg.tokens,
            temperature=seg.temperature,
            avg_logprob=seg.avg_logprob,
            compression_ratio=seg.compression_ratio,
            no_speech_prob=seg.no_speech_prob,
            words=words
        )
62
+
63
+
64
class Word(BaseModel):
    """A single word with timing and confidence, attached to a Segment."""
    start: Optional[float] = Field(default=None, description="Start time of the word")
    # Fixed copy-paste typo: description previously said "Start time of the word".
    end: Optional[float] = Field(default=None, description="End time of the word")
    word: Optional[str] = Field(default=None, description="Word text")
    probability: Optional[float] = Field(default=None, description="Probability of the word")
69
+
70
+
71
class BaseParams(BaseModel):
    """Common list/dict (de)serialization helpers for all parameter models."""
    model_config = ConfigDict(protected_namespaces=())

    def to_dict(self) -> Dict:
        """Return the parameters as a plain dict."""
        return self.model_dump()

    def to_list(self) -> List:
        """Return the parameter values in field-declaration order."""
        return [value for value in self.model_dump().values()]

    @classmethod
    def from_list(cls, data_list: List) -> 'BaseParams':
        """Rebuild the model from values listed in field-declaration order."""
        return cls(**dict(zip(cls.model_fields.keys(), data_list)))
84
+
85
+
86
class VadParams(BaseParams):
    """Voice Activity Detection parameters"""
    vad_filter: bool = Field(default=False, description="Enable voice activity detection to filter out non-speech parts")
    threshold: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Speech threshold for Silero VAD. Probabilities above this value are considered speech"
    )
    min_speech_duration_ms: int = Field(
        default=250,
        ge=0,
        description="Final speech chunks shorter than this are discarded"
    )
    max_speech_duration_s: float = Field(
        default=float("inf"),
        gt=0,
        description="Maximum duration of speech chunks in seconds"
    )
    min_silence_duration_ms: int = Field(
        default=2000,
        ge=0,
        description="Minimum silence duration between speech chunks"
    )
    speech_pad_ms: int = Field(
        default=400,
        ge=0,
        description="Padding added to each side of speech chunks"
    )

    @classmethod
    def to_gradio_inputs(cls, defaults: Optional[Dict] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the VAD settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when a key is
            missing or when defaults is None.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Silero VAD Filter"),
                value=defaults.get("vad_filter", cls.__fields__["vad_filter"].default),
                interactive=True,
                info=_("Enable this to transcribe only detected voice")
            ),
            gr.Slider(
                minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold",
                value=defaults.get("threshold", cls.__fields__["threshold"].default),
                info="Lower it to be more sensitive to small sounds."
            ),
            gr.Number(
                label="Minimum Speech Duration (ms)", precision=0,
                value=defaults.get("min_speech_duration_ms", cls.__fields__["min_speech_duration_ms"].default),
                info="Final speech chunks shorter than this time are thrown out"
            ),
            gr.Number(
                label="Maximum Speech Duration (s)",
                value=defaults.get("max_speech_duration_s", GRADIO_NONE_NUMBER_MAX),
                info="Maximum duration of speech chunks in \"seconds\"."
            ),
            gr.Number(
                label="Minimum Silence Duration (ms)", precision=0,
                value=defaults.get("min_silence_duration_ms", cls.__fields__["min_silence_duration_ms"].default),
                info="In the end of each speech chunk wait for this time before separating it"
            ),
            gr.Number(
                label="Speech Padding (ms)", precision=0,
                value=defaults.get("speech_pad_ms", cls.__fields__["speech_pad_ms"].default),
                info="Final speech chunks are padded by this time each side"
            )
        ]
151
+
152
+
153
class DiarizationParams(BaseParams):
    """Speaker diarization parameters"""
    is_diarize: bool = Field(default=False, description="Enable speaker diarization")
    device: str = Field(default="cuda", description="Device to run Diarization model.")
    hf_token: str = Field(
        default="",
        description="Hugging Face token for downloading diarization models"
    )

    @classmethod
    def to_gradio_inputs(cls,
                         defaults: Optional[Dict] = None,
                         available_devices: Optional[List] = None,
                         device: Optional[str] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the diarization settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when missing.
        available_devices: Optional[List]
            Device choices to offer; falls back to ["cpu", "cuda"].
        device: Optional[str]
            Device preselected when no cached value exists.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Diarization"),
                value=defaults.get("is_diarize", cls.__fields__["is_diarize"].default),
            ),
            gr.Dropdown(
                label=_("Device"),
                choices=["cpu", "cuda"] if available_devices is None else available_devices,
                value=defaults.get("device", device),
            ),
            gr.Textbox(
                label=_("HuggingFace Token"),
                value=defaults.get("hf_token", cls.__fields__["hf_token"].default),
                info=_("This is only needed the first time you download the model")
            ),
        ]
183
+
184
+
185
class BGMSeparationParams(BaseParams):
    """Background music separation parameters"""
    is_separate_bgm: bool = Field(default=False, description="Enable background music separation")
    model_size: str = Field(
        default="UVR-MDX-NET-Inst_HQ_4",
        description="UVR model size"
    )
    device: str = Field(default="cuda", description="Device to run UVR model.")
    segment_size: int = Field(
        default=256,
        gt=0,
        description="Segment size for UVR model"
    )
    save_file: bool = Field(
        default=False,
        description="Whether to save separated audio files"
    )
    enable_offload: bool = Field(
        default=True,
        description="Offload UVR model after transcription"
    )

    # NOTE: named to_gradio_input (singular), unlike the sibling classes'
    # to_gradio_inputs — kept for backward compatibility with callers.
    @classmethod
    def to_gradio_input(cls,
                        defaults: Optional[Dict] = None,
                        available_devices: Optional[List] = None,
                        device: Optional[str] = None,
                        available_models: Optional[List] = None) -> List[gr.components.base.FormComponent]:
        """Build the gradio input components for the BGM separation settings.

        Parameters
        ----------
        defaults: Optional[Dict]
            Previously cached values; field defaults are used when missing.
        available_devices: Optional[List]
            Device choices; falls back to ["cpu", "cuda"].
        device: Optional[str]
            Device preselected when no cached value exists.
        available_models: Optional[List]
            UVR model choices; falls back to the two built-in model names.
        """
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults
        return [
            gr.Checkbox(
                label=_("Enable Background Music Remover Filter"),
                value=defaults.get("is_separate_bgm", cls.__fields__["is_separate_bgm"].default),
                interactive=True,
                info=_("Enabling this will remove background music")
            ),
            gr.Dropdown(
                label=_("Model"),
                choices=["UVR-MDX-NET-Inst_HQ_4",
                         "UVR-MDX-NET-Inst_3"] if available_models is None else available_models,
                value=defaults.get("model_size", cls.__fields__["model_size"].default),
            ),
            gr.Dropdown(
                label=_("Device"),
                choices=["cpu", "cuda"] if available_devices is None else available_devices,
                value=defaults.get("device", device),
            ),
            gr.Number(
                label="Segment Size",
                value=defaults.get("segment_size", cls.__fields__["segment_size"].default),
                precision=0,
                info="Segment size for UVR model"
            ),
            gr.Checkbox(
                label=_("Save separated files to output"),
                value=defaults.get("save_file", cls.__fields__["save_file"].default),
            ),
            gr.Checkbox(
                label=_("Offload sub model after removing background music"),
                value=defaults.get("enable_offload", cls.__fields__["enable_offload"].default),
            )
        ]
246
+
247
+
248
class WhisperParams(BaseParams):
    """Whisper parameters"""
    model_size: str = Field(default="large-v2", description="Whisper model size")
    lang: Optional[str] = Field(default=None, description="Source language of the file to transcribe")
    is_translate: bool = Field(default=False, description="Translate speech to English end-to-end")
    beam_size: int = Field(default=5, ge=1, description="Beam size for decoding")
    log_prob_threshold: float = Field(
        default=-1.0,
        description="Threshold for average log probability of sampled tokens"
    )
    no_speech_threshold: float = Field(
        default=0.6,
        ge=0.0,
        le=1.0,
        description="Threshold for detecting silence"
    )
    compute_type: str = Field(default="float16", description="Computation type for transcription")
    best_of: int = Field(default=5, ge=1, description="Number of candidates when sampling")
    patience: float = Field(default=1.0, gt=0, description="Beam search patience factor")
    condition_on_previous_text: bool = Field(
        default=True,
        description="Use previous output as prompt for next window"
    )
    prompt_reset_on_temperature: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Temperature threshold for resetting prompt"
    )
    initial_prompt: Optional[str] = Field(default=None, description="Initial prompt for first window")
    temperature: float = Field(
        default=0.0,
        ge=0.0,
        description="Temperature for sampling"
    )
    compression_ratio_threshold: float = Field(
        default=2.4,
        gt=0,
        description="Threshold for gzip compression ratio"
    )
    length_penalty: float = Field(default=1.0, gt=0, description="Exponential length penalty")
    repetition_penalty: float = Field(default=1.0, gt=0, description="Penalty for repeated tokens")
    no_repeat_ngram_size: int = Field(default=0, ge=0, description="Size of n-grams to prevent repetition")
    prefix: Optional[str] = Field(default=None, description="Prefix text for first window")
    suppress_blank: bool = Field(
        default=True,
        description="Suppress blank outputs at start of sampling"
    )
    suppress_tokens: Optional[Union[List[int], str]] = Field(default=[-1], description="Token IDs to suppress")
    max_initial_timestamp: float = Field(
        default=1.0,
        ge=0.0,
        description="Maximum initial timestamp"
    )
    word_timestamps: bool = Field(default=False, description="Extract word-level timestamps")
    prepend_punctuations: Optional[str] = Field(
        default="\"'“¿([{-",
        description="Punctuations to merge with next word"
    )
    append_punctuations: Optional[str] = Field(
        default="\"'.。,,!!??::”)]}、",
        description="Punctuations to merge with previous word"
    )
    max_new_tokens: Optional[int] = Field(default=None, description="Maximum number of new tokens per chunk")
    chunk_length: Optional[int] = Field(default=30, description="Length of audio segments in seconds")
    hallucination_silence_threshold: Optional[float] = Field(
        default=None,
        description="Threshold for skipping silent periods in hallucination detection"
    )
    hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
    language_detection_threshold: Optional[float] = Field(
        default=None,
        description="Threshold for language detection probability"
    )
    language_detection_segments: int = Field(
        default=1,
        gt=0,
        description="Number of segments for language detection"
    )
    batch_size: int = Field(default=24, gt=0, description="Batch size for processing")

    @field_validator('lang')
    def validate_lang(cls, v):
        """Map the 'automatic detection' display value to None (auto-detect)."""
        from modules.utils.constants import AUTOMATIC_DETECTION
        return None if v == AUTOMATIC_DETECTION.unwrap() else v

    @field_validator('suppress_tokens')
    def validate_supress_tokens(cls, v):
        """Parse a stringified token list (e.g. "[-1]") back into a list."""
        import ast
        try:
            if isinstance(v, str):
                suppress_tokens = ast.literal_eval(v)
                if not isinstance(suppress_tokens, list):
                    raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
                return suppress_tokens
            if isinstance(v, list):
                return v
        except Exception as e:
            raise ValueError(f"Invalid Suppress Tokens. The value must be type of List[int]: {e}")

    @classmethod
    def to_gradio_inputs(cls,
                         defaults: Optional[Dict] = None,
                         only_advanced: Optional[bool] = True,
                         whisper_type: Optional[str] = None,
                         available_models: Optional[List] = None,
                         available_langs: Optional[List] = None,
                         available_compute_types: Optional[List] = None,
                         compute_type: Optional[str] = None):
        """Build the gradio input components for whisper settings.

        Backend-specific components (faster-whisper / insanely-fast-whisper)
        are always created but hidden when they don't apply, so the flat
        parameter list passed through gradio keeps a fixed length and order.
        """
        whisper_type = WhisperImpl.FASTER_WHISPER.value if whisper_type is None else whisper_type.strip().lower()
        # Guard: the signature allows defaults=None but the lookups below
        # require a mapping (previously raised AttributeError on None).
        defaults = {} if defaults is None else defaults

        inputs = []
        if not only_advanced:
            inputs += [
                gr.Dropdown(
                    label=_("Model"),
                    choices=available_models,
                    value=defaults.get("model_size", cls.__fields__["model_size"].default),
                ),
                gr.Dropdown(
                    label=_("Language"),
                    choices=available_langs,
                    value=defaults.get("lang", AUTOMATIC_DETECTION),
                ),
                gr.Checkbox(
                    label=_("Translate to English?"),
                    value=defaults.get("is_translate", cls.__fields__["is_translate"].default),
                ),
            ]

        inputs += [
            gr.Number(
                label="Beam Size",
                value=defaults.get("beam_size", cls.__fields__["beam_size"].default),
                precision=0,
                info="Beam size for decoding"
            ),
            gr.Number(
                label="Log Probability Threshold",
                value=defaults.get("log_prob_threshold", cls.__fields__["log_prob_threshold"].default),
                info="Threshold for average log probability of sampled tokens"
            ),
            gr.Number(
                label="No Speech Threshold",
                value=defaults.get("no_speech_threshold", cls.__fields__["no_speech_threshold"].default),
                info="Threshold for detecting silence"
            ),
            gr.Dropdown(
                label="Compute Type",
                choices=["float16", "int8", "int16"] if available_compute_types is None else available_compute_types,
                value=defaults.get("compute_type", compute_type),
                info="Computation type for transcription"
            ),
            gr.Number(
                label="Best Of",
                value=defaults.get("best_of", cls.__fields__["best_of"].default),
                precision=0,
                info="Number of candidates when sampling"
            ),
            gr.Number(
                label="Patience",
                value=defaults.get("patience", cls.__fields__["patience"].default),
                info="Beam search patience factor"
            ),
            gr.Checkbox(
                label="Condition On Previous Text",
                value=defaults.get("condition_on_previous_text", cls.__fields__["condition_on_previous_text"].default),
                info="Use previous output as prompt for next window"
            ),
            gr.Slider(
                label="Prompt Reset On Temperature",
                value=defaults.get("prompt_reset_on_temperature",
                                   cls.__fields__["prompt_reset_on_temperature"].default),
                minimum=0,
                maximum=1,
                step=0.01,
                info="Temperature threshold for resetting prompt"
            ),
            gr.Textbox(
                label="Initial Prompt",
                value=defaults.get("initial_prompt", GRADIO_NONE_STR),
                info="Initial prompt for first window"
            ),
            gr.Slider(
                label="Temperature",
                value=defaults.get("temperature", cls.__fields__["temperature"].default),
                minimum=0.0,
                step=0.01,
                maximum=1.0,
                info="Temperature for sampling"
            ),
            gr.Number(
                label="Compression Ratio Threshold",
                value=defaults.get("compression_ratio_threshold",
                                   cls.__fields__["compression_ratio_threshold"].default),
                info="Threshold for gzip compression ratio"
            )
        ]

        faster_whisper_inputs = [
            gr.Number(
                label="Length Penalty",
                value=defaults.get("length_penalty", cls.__fields__["length_penalty"].default),
                info="Exponential length penalty",
            ),
            gr.Number(
                label="Repetition Penalty",
                value=defaults.get("repetition_penalty", cls.__fields__["repetition_penalty"].default),
                info="Penalty for repeated tokens"
            ),
            gr.Number(
                label="No Repeat N-gram Size",
                value=defaults.get("no_repeat_ngram_size", cls.__fields__["no_repeat_ngram_size"].default),
                precision=0,
                info="Size of n-grams to prevent repetition"
            ),
            gr.Textbox(
                label="Prefix",
                value=defaults.get("prefix", GRADIO_NONE_STR),
                info="Prefix text for first window"
            ),
            gr.Checkbox(
                label="Suppress Blank",
                value=defaults.get("suppress_blank", cls.__fields__["suppress_blank"].default),
                info="Suppress blank outputs at start of sampling"
            ),
            gr.Textbox(
                label="Suppress Tokens",
                value=defaults.get("suppress_tokens", "[-1]"),
                info="Token IDs to suppress"
            ),
            gr.Number(
                label="Max Initial Timestamp",
                value=defaults.get("max_initial_timestamp", cls.__fields__["max_initial_timestamp"].default),
                info="Maximum initial timestamp"
            ),
            gr.Checkbox(
                label="Word Timestamps",
                value=defaults.get("word_timestamps", cls.__fields__["word_timestamps"].default),
                info="Extract word-level timestamps"
            ),
            gr.Textbox(
                label="Prepend Punctuations",
                value=defaults.get("prepend_punctuations", cls.__fields__["prepend_punctuations"].default),
                info="Punctuations to merge with next word"
            ),
            gr.Textbox(
                label="Append Punctuations",
                value=defaults.get("append_punctuations", cls.__fields__["append_punctuations"].default),
                info="Punctuations to merge with previous word"
            ),
            gr.Number(
                label="Max New Tokens",
                value=defaults.get("max_new_tokens", GRADIO_NONE_NUMBER_MIN),
                precision=0,
                info="Maximum number of new tokens per chunk"
            ),
            gr.Number(
                label="Chunk Length (s)",
                value=defaults.get("chunk_length", cls.__fields__["chunk_length"].default),
                precision=0,
                info="Length of audio segments in seconds"
            ),
            gr.Number(
                label="Hallucination Silence Threshold (sec)",
                value=defaults.get("hallucination_silence_threshold",
                                   GRADIO_NONE_NUMBER_MIN),
                info="Threshold for skipping silent periods in hallucination detection"
            ),
            gr.Textbox(
                label="Hotwords",
                value=defaults.get("hotwords", cls.__fields__["hotwords"].default),
                info="Hotwords/hint phrases for the model"
            ),
            gr.Number(
                label="Language Detection Threshold",
                value=defaults.get("language_detection_threshold",
                                   GRADIO_NONE_NUMBER_MIN),
                info="Threshold for language detection probability"
            ),
            gr.Number(
                label="Language Detection Segments",
                value=defaults.get("language_detection_segments",
                                   cls.__fields__["language_detection_segments"].default),
                precision=0,
                info="Number of segments for language detection"
            )
        ]

        insanely_fast_whisper_inputs = [
            gr.Number(
                label="Batch Size",
                value=defaults.get("batch_size", cls.__fields__["batch_size"].default),
                precision=0,
                info="Batch size for processing"
            )
        ]

        # Hide (not drop) inapplicable backend inputs so the parameter list
        # keeps a fixed length and order across backends.
        if whisper_type != WhisperImpl.FASTER_WHISPER.value:
            for input_component in faster_whisper_inputs:
                input_component.visible = False

        if whisper_type != WhisperImpl.INSANELY_FAST_WHISPER.value:
            for input_component in insanely_fast_whisper_inputs:
                input_component.visible = False

        inputs += faster_whisper_inputs + insanely_fast_whisper_inputs

        return inputs
557
+
558
+
559
class TranscriptionPipelineParams(BaseModel):
    """Transcription pipeline parameters"""
    whisper: WhisperParams = Field(default_factory=WhisperParams)
    vad: VadParams = Field(default_factory=VadParams)
    diarization: DiarizationParams = Field(default_factory=DiarizationParams)
    bgm_separation: BGMSeparationParams = Field(default_factory=BGMSeparationParams)

    def to_dict(self) -> Dict:
        """Return a nested dict with one sub-dict per parameter group."""
        data = {
            "whisper": self.whisper.to_dict(),
            "vad": self.vad.to_dict(),
            "diarization": self.diarization.to_dict(),
            "bgm_separation": self.bgm_separation.to_dict()
        }
        return data

    def to_list(self) -> List:
        """
        Convert data class to the list because I have to pass the parameters as a list in the gradio.
        Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
        See more about Gradio pre-processing: https://www.gradio.app/docs/components

        The concatenation order (whisper, vad, diarization, bgm_separation)
        must match the slicing order in from_list below.
        """
        whisper_list = self.whisper.to_list()
        vad_list = self.vad.to_list()
        diarization_list = self.diarization.to_list()
        bgm_sep_list = self.bgm_separation.to_list()
        return whisper_list + vad_list + diarization_list + bgm_sep_list

    @staticmethod
    def from_list(pipeline_list: List) -> 'TranscriptionPipelineParams':
        """Convert list to the data class again to use it in a function.

        Slices the flat gradio value list back into per-group chunks using
        each model's annotated-field count; chunk order must mirror to_list.
        """
        data_list = deepcopy(pipeline_list)

        # len(__annotations__) == number of declared fields for each model.
        whisper_list = data_list[0:len(WhisperParams.__annotations__)]
        data_list = data_list[len(WhisperParams.__annotations__):]

        vad_list = data_list[0:len(VadParams.__annotations__)]
        data_list = data_list[len(VadParams.__annotations__):]

        diarization_list = data_list[0:len(DiarizationParams.__annotations__)]
        data_list = data_list[len(DiarizationParams.__annotations__):]

        bgm_sep_list = data_list[0:len(BGMSeparationParams.__annotations__)]

        return TranscriptionPipelineParams(
            whisper=WhisperParams.from_list(whisper_list),
            vad=VadParams.from_list(vad_list),
            diarization=DiarizationParams.from_list(diarization_list),
            bgm_separation=BGMSeparationParams.from_list(bgm_sep_list)
        )
modules/whisper/faster_whisper_inference.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ import torch
5
+ from typing import BinaryIO, Union, Tuple, List
6
+ import faster_whisper
7
+ from faster_whisper.vad import VadOptions
8
+ import ast
9
+ import ctranslate2
10
+ import whisper
11
+ import gradio as gr
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
+
18
+
19
+ class FasterWhisperInference(BaseTranscriptionPipeline):
20
def __init__(self,
             model_dir: str = FASTER_WHISPER_MODELS_DIR,
             diarization_model_dir: str = DIARIZATION_MODELS_DIR,
             uvr_model_dir: str = UVR_MODELS_DIR,
             output_dir: str = OUTPUT_DIR,
             ):
    """Initialize the faster-whisper pipeline and discover local models.

    Parameters
    ----------
    model_dir: str
        Directory where faster-whisper model weights live / are downloaded.
    diarization_model_dir: str
        Directory for diarization model weights.
    uvr_model_dir: str
        Directory for UVR (background music separation) model weights.
    output_dir: str
        Directory where transcription outputs are written.
    """
    super().__init__(
        model_dir=model_dir,
        diarization_model_dir=diarization_model_dir,
        uvr_model_dir=uvr_model_dir,
        output_dir=output_dir
    )
    self.model_dir = model_dir
    os.makedirs(self.model_dir, exist_ok=True)

    # NOTE(review): assumed to map model-size name -> local path; confirm
    # against get_model_paths(). available_models is a live dict view.
    self.model_paths = self.get_model_paths()
    self.device = self.get_device()
    self.available_models = self.model_paths.keys()
38
+
39
def transcribe(self,
               audio: Union[str, BinaryIO, np.ndarray],
               progress: gr.Progress = gr.Progress(),
               *whisper_params,
               ) -> Tuple[List[Segment], float]:
    """
    transcribe method for faster-whisper.

    Parameters
    ----------
    audio: Union[str, BinaryIO, np.ndarray]
        Audio path or file binary or Audio numpy array
    progress: gr.Progress
        Indicator to show progress directly in gradio.
    *whisper_params: tuple
        Parameters related with whisper. This will be dealt with "WhisperParameters" data class

    Returns
    ----------
    segments_result: List[Segment]
        list of Segment that includes start, end timestamps and transcribed text
    elapsed_time: float
        elapsed time for transcription
    """
    start_time = time.time()

    # Flat gradio value list -> typed parameter object (order-sensitive).
    params = WhisperParams.from_list(list(whisper_params))

    # Reload only when the requested model or compute type changed.
    if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
        self.update_model(params.model_size, params.compute_type, progress)

    segments, info = self.model.transcribe(
        audio=audio,
        language=params.lang,
        # English-only models cannot run the translate task.
        task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
        beam_size=params.beam_size,
        log_prob_threshold=params.log_prob_threshold,
        no_speech_threshold=params.no_speech_threshold,
        best_of=params.best_of,
        patience=params.patience,
        temperature=params.temperature,
        initial_prompt=params.initial_prompt,
        compression_ratio_threshold=params.compression_ratio_threshold,
        length_penalty=params.length_penalty,
        repetition_penalty=params.repetition_penalty,
        no_repeat_ngram_size=params.no_repeat_ngram_size,
        prefix=params.prefix,
        suppress_blank=params.suppress_blank,
        suppress_tokens=params.suppress_tokens,
        max_initial_timestamp=params.max_initial_timestamp,
        word_timestamps=params.word_timestamps,
        prepend_punctuations=params.prepend_punctuations,
        append_punctuations=params.append_punctuations,
        max_new_tokens=params.max_new_tokens,
        chunk_length=params.chunk_length,
        hallucination_silence_threshold=params.hallucination_silence_threshold,
        hotwords=params.hotwords,
        language_detection_threshold=params.language_detection_threshold,
        language_detection_segments=params.language_detection_segments,
        prompt_reset_on_temperature=params.prompt_reset_on_temperature,
    )
    # NOTE(review): presumably model.transcribe returns lazily, so decoding
    # actually happens in the loop below — confirm against faster-whisper docs.
    progress(0, desc="Loading audio..")

    segments_result = []
    for segment in segments:
        # Per-segment progress based on how far into the audio we are.
        progress(segment.start / info.duration, desc="Transcribing..")
        segments_result.append(Segment.from_faster_whisper(segment))

    elapsed_time = time.time() - start_time
    return segments_result, elapsed_time
109
+
110
+ def update_model(self,
111
+ model_size: str,
112
+ compute_type: str,
113
+ progress: gr.Progress = gr.Progress()
114
+ ):
115
+ """
116
+ Update current model setting
117
+
118
+ Parameters
119
+ ----------
120
+ model_size: str
121
+ Size of whisper model
122
+ compute_type: str
123
+ Compute type for transcription.
124
+ see more info : https://opennmt.net/CTranslate2/quantization.html
125
+ progress: gr.Progress
126
+ Indicator to show progress directly in gradio.
127
+ """
128
+ progress(0, desc="Initializing Model..")
129
+ self.current_model_size = self.model_paths[model_size]
130
+ self.current_compute_type = compute_type
131
+ self.model = faster_whisper.WhisperModel(
132
+ device=self.device,
133
+ model_size_or_path=self.current_model_size,
134
+ download_root=self.model_dir,
135
+ compute_type=self.current_compute_type
136
+ )
137
+
138
+ def get_model_paths(self):
139
+ """
140
+ Get available models from models path including fine-tuned model.
141
+
142
+ Returns
143
+ ----------
144
+ Name list of models
145
+ """
146
+ model_paths = {model:model for model in faster_whisper.available_models()}
147
+ faster_whisper_prefix = "models--Systran--faster-whisper-"
148
+
149
+ existing_models = os.listdir(self.model_dir)
150
+ wrong_dirs = [".locks"]
151
+ existing_models = list(set(existing_models) - set(wrong_dirs))
152
+
153
+ for model_name in existing_models:
154
+ if faster_whisper_prefix in model_name:
155
+ model_name = model_name[len(faster_whisper_prefix):]
156
+
157
+ if model_name not in whisper.available_models():
158
+ model_paths[model_name] = os.path.join(self.model_dir, model_name)
159
+ return model_paths
160
+
161
+ @staticmethod
162
+ def get_device():
163
+ if torch.cuda.is_available():
164
+ return "cuda"
165
+ else:
166
+ return "auto"
167
+
168
+ @staticmethod
169
+ def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
170
+ try:
171
+ suppress_tokens = ast.literal_eval(suppress_tokens_str)
172
+ if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
173
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
174
+ return suppress_tokens
175
+ except Exception as e:
176
+ raise ValueError("Invalid Suppress Tokens. The value must be type of List[int]")
modules/whisper/insanely_fast_whisper_inference.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import numpy as np
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import torch
6
+ from transformers import pipeline
7
+ from transformers.utils import is_flash_attn_2_available
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import whisper
11
+ from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
12
+ from argparse import Namespace
13
+
14
+ from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
15
+ from modules.whisper.data_classes import *
16
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
17
+
18
+
19
class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
    """
    Transcription pipeline implemented on top of insanely-fast-whisper
    (HuggingFace `transformers` ASR pipeline with batched inference).
    """
    def __init__(self,
                 model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        self.available_models = self.get_model_paths()

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[Segment], float]:
        """
        transcribe method for insanely-fast-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParams" data class

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParams.from_list(list(whisper_params))

        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
        # Use a distinct name for the rich progress bar so it does not shadow
        # the gradio `progress` parameter above.
        with Progress(
            TextColumn("[progress.description]{task.description}"),
            BarColumn(style="yellow1", pulse_style="white"),
            TimeElapsedColumn(),
        ) as rich_progress:
            rich_progress.add_task("[yellow]Transcribing...", total=None)

            kwargs = {
                "no_speech_threshold": params.no_speech_threshold,
                "temperature": params.temperature,
                "compression_ratio_threshold": params.compression_ratio_threshold,
                "logprob_threshold": params.log_prob_threshold,
            }

            # English-only checkpoints ("*.en") reject language/task generation options.
            if not self.current_model_size.endswith(".en"):
                kwargs["language"] = params.lang
                kwargs["task"] = "translate" if params.is_translate else "transcribe"

            segments = self.model(
                inputs=audio,
                return_timestamps=True,
                chunk_length_s=params.chunk_length,
                batch_size=params.batch_size,
                generate_kwargs=kwargs
            )

        segments_result = []
        for item in segments["chunks"]:
            start, end = item["timestamp"][0], item["timestamp"][1]
            if end is None:
                # The final chunk can come back without an end timestamp.
                end = start
            segments_result.append(Segment(
                text=item["text"],
                start=start,
                end=end
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        model_path = os.path.join(self.model_dir, model_size)
        # Download the checkpoint when the local directory is missing or empty.
        if not os.path.isdir(model_path) or not os.listdir(model_path):
            self.download_model(
                model_size=model_size,
                download_root=model_path,
                progress=progress
            )

        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = pipeline(
            "automatic-speech-recognition",
            model=model_path,
            torch_dtype=self.current_compute_type,
            device=self.device,
            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
        )

    def get_model_paths(self):
        """
        Get available models from models path including fine-tuned model.

        Returns
        ----------
        Name list of models (stock openai/distil models first, then local ones),
        de-duplicated while preserving order.
        """
        openai_models = whisper.available_models()
        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
        default_models = openai_models + distil_models

        existing_models = os.listdir(self.model_dir)
        wrong_dirs = [".locks"]

        available_models = default_models + existing_models
        available_models = [model for model in available_models if model not in wrong_dirs]
        available_models = sorted(set(available_models), key=available_models.index)

        return available_models

    @staticmethod
    def download_model(
        model_size: str,
        download_root: str,
        progress: gr.Progress
    ):
        """Download every file of a whisper/distil-whisper checkpoint from the HF hub."""
        progress(0, 'Initializing model..')
        print(f'Downloading {model_size} to "{download_root}"....')

        os.makedirs(download_root, exist_ok=True)
        download_list = [
            "model.safetensors",
            "config.json",
            "generation_config.json",
            "preprocessor_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "added_tokens.json",
            "special_tokens_map.json",
            "vocab.json",
        ]

        if model_size.startswith("distil"):
            repo_id = f"distil-whisper/{model_size}"
        else:
            repo_id = f"openai/whisper-{model_size}"
        for item in download_list:
            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)
modules/whisper/whisper_Inference.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import whisper
2
+ import gradio as gr
3
+ import time
4
+ from typing import BinaryIO, Union, Tuple, List
5
+ import numpy as np
6
+ import torch
7
+ import os
8
+ from argparse import Namespace
9
+
10
+ from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
11
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
12
+ from modules.whisper.data_classes import *
13
+
14
+
15
class WhisperInference(BaseTranscriptionPipeline):
    """
    Transcription pipeline implemented on top of the original openai/whisper package.
    """
    def __init__(self,
                 model_dir: str = WHISPER_MODELS_DIR,
                 diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                 uvr_model_dir: str = UVR_MODELS_DIR,
                 output_dir: str = OUTPUT_DIR,
                 ):
        super().__init__(
            model_dir=model_dir,
            output_dir=output_dir,
            diarization_model_dir=diarization_model_dir,
            uvr_model_dir=uvr_model_dir
        )

    def transcribe(self,
                   audio: Union[str, np.ndarray, torch.Tensor],
                   progress: gr.Progress = gr.Progress(),
                   *whisper_params,
                   ) -> Tuple[List[Segment], float]:
        """
        transcribe method for openai/whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParams" data class

        Returns
        ----------
        segments_result: List[Segment]
            list of Segment that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()
        params = WhisperParams.from_list(list(whisper_params))

        # Reload the model only when size / compute type actually changed.
        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        def progress_callback(progress_value):
            # Forward native whisper progress into the gradio indicator.
            progress(progress_value, desc="Transcribing..")

        result = self.model.transcribe(audio=audio,
                                       language=params.lang,
                                       verbose=False,
                                       beam_size=params.beam_size,
                                       logprob_threshold=params.log_prob_threshold,
                                       no_speech_threshold=params.no_speech_threshold,
                                       task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
                                       fp16=params.compute_type == "float16",
                                       best_of=params.best_of,
                                       patience=params.patience,
                                       temperature=params.temperature,
                                       compression_ratio_threshold=params.compression_ratio_threshold,
                                       progress_callback=progress_callback,)["segments"]
        segments_result = []
        for segment in result:
            segments_result.append(Segment(
                start=segment["start"],
                end=segment["end"],
                text=segment["text"]
            ))

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress = gr.Progress(),
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        self.current_compute_type = compute_type
        self.current_model_size = model_size
        self.model = whisper.load_model(
            name=model_size,
            device=self.device,
            download_root=self.model_dir
        )
+ )
modules/whisper/whisper_factory.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import os
3
+
4
+ from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
5
+ INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
6
+ from modules.whisper.faster_whisper_inference import FasterWhisperInference
7
+ from modules.whisper.whisper_Inference import WhisperInference
8
+ from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
9
+ from modules.whisper.base_transcription_pipeline import BaseTranscriptionPipeline
10
+ from modules.whisper.data_classes import *
11
+
12
+
13
class WhisperFactory:
    """Factory that instantiates the selected Whisper backend implementation."""

    @staticmethod
    def create_whisper_inference(
        whisper_type: str,
        whisper_model_dir: str = WHISPER_MODELS_DIR,
        faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
        insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
        diarization_model_dir: str = DIARIZATION_MODELS_DIR,
        uvr_model_dir: str = UVR_MODELS_DIR,
        output_dir: str = OUTPUT_DIR,
    ) -> "BaseTranscriptionPipeline":
        """
        Create a whisper inference class based on the provided whisper_type.

        Parameters
        ----------
        whisper_type : str
            The type of Whisper implementation to use. Supported values (case-insensitive):
            - "faster-whisper": https://github.com/SYSTRAN/faster-whisper
            - "whisper": https://github.com/openai/whisper
            - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
            Any unrecognized value falls back to faster-whisper.
        whisper_model_dir : str
            Directory path for the Whisper model.
        faster_whisper_model_dir : str
            Directory path for the Faster Whisper model.
        insanely_fast_whisper_model_dir : str
            Directory path for the Insanely Fast Whisper model.
        diarization_model_dir : str
            Directory path for the diarization model.
        uvr_model_dir : str
            Directory path for the UVR model.
        output_dir : str
            Directory path where output files will be saved.

        Returns
        -------
        BaseTranscriptionPipeline
            An instance of the appropriate whisper inference class based on the whisper_type.
        """
        # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

        whisper_type = whisper_type.strip().lower()

        if whisper_type == WhisperImpl.WHISPER.value:
            return WhisperInference(
                model_dir=whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        elif whisper_type == WhisperImpl.INSANELY_FAST_WHISPER.value:
            return InsanelyFastWhisperInference(
                model_dir=insanely_fast_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
        else:
            # "faster-whisper" and any unknown value both resolve here:
            # faster-whisper is the explicit choice AND the default fallback.
            return FasterWhisperInference(
                model_dir=faster_whisper_model_dir,
                output_dir=output_dir,
                diarization_model_dir=diarization_model_dir,
                uvr_model_dir=uvr_model_dir
            )
notebook/whisper-webui.ipynb ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "---\n",
7
+ "\n",
8
+ "📌 **This notebook has been updated [here](https://github.com/jhj0517/Whisper-WebUI.git)!**\n",
9
+ "\n",
10
+ "🖋 **Author**: [jhj0517](https://github.com/jhj0517/Whisper-WebUI/blob/master/notebook/whisper-webui.ipynb)\n",
11
+ "\n",
12
+ "😎 **Support the Project**:\n",
13
+ "\n",
14
+ "If you find this project useful, please consider supporting it:\n",
15
+ "\n",
16
+ "<a href=\"https://ko-fi.com/jhj0517\" target=\"_blank\">\n",
17
+ " <img src=\"https://storage.ko-fi.com/cdn/kofi2.png?v=3\" alt=\"Buy Me a Coffee at ko-fi.com\" height=\"36\">\n",
18
+ "</a>\n",
19
+ "\n",
20
+ "---"
21
+ ],
22
+ "metadata": {
23
+ "id": "doKhBBXIfS21"
24
+ }
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "source": [
29
+ "#@title #(Optional) Check GPU\n",
30
+ "#@markdown Some models may not function correctly on a CPU runtime.\n",
31
+ "\n",
32
+ "#@markdown so you should check your GPU setup before run.\n",
33
+ "!nvidia-smi"
34
+ ],
35
+ "metadata": {
36
+ "id": "23yZvUlagEsx",
37
+ "cellView": "form"
38
+ },
39
+ "execution_count": null,
40
+ "outputs": []
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {
46
+ "id": "kNbSbsctxahq",
47
+ "cellView": "form"
48
+ },
49
+ "outputs": [],
50
+ "source": [
51
+ "#@title #Installation\n",
52
+ "#@markdown This cell will install dependencies for Whisper-WebUI!\n",
53
+ "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
54
+ "%cd Whisper-WebUI\n",
55
+ "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56
+ "!pip install faster-whisper==1.0.3\n",
57
+ "!pip install ctranslate2==4.4.0\n",
58
+ "!pip install gradio\n",
59
+ "!pip install gradio-i18n\n",
60
+ "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
61
+ "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
62
+ "!pip install tokenizers==0.19.1\n",
63
+ "!pip install pyannote.audio==3.3.1\n",
64
+ "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "source": [
70
+ "#@title # (Optional) Configure arguments\n",
71
+ "#@markdown This section is used to configure some command line arguments.\n",
72
+ "\n",
73
+ "#@markdown You can simply ignore this section and the default values will be used.\n",
74
+ "\n",
75
+ "USERNAME = '' #@param {type: \"string\"}\n",
76
+ "PASSWORD = '' #@param {type: \"string\"}\n",
77
+ "WHISPER_TYPE = 'faster-whisper' # @param [\"whisper\", \"faster-whisper\", \"insanely-fast-whisper\"]\n",
78
+ "THEME = '' #@param {type: \"string\"}\n",
79
+ "\n",
80
+ "arguments = \"\"\n",
81
+ "if USERNAME:\n",
82
+ " arguments += f\" --username {USERNAME}\"\n",
83
+ "if PASSWORD:\n",
84
+ " arguments += f\" --password {PASSWORD}\"\n",
85
+ "if THEME:\n",
86
+ " arguments += f\" --theme {THEME}\"\n",
87
+ "if WHISPER_TYPE:\n",
88
+ " arguments += f\" --whisper_type {WHISPER_TYPE}\"\n",
89
+ "\n",
90
+ "\n",
91
+ "#@markdown If you wonder how these arguments are used, you can see the [Wiki](https://github.com/jhj0517/Whisper-WebUI/wiki/Command-Line-Arguments)."
92
+ ],
93
+ "metadata": {
94
+ "id": "Qosz9BFlGui3",
95
+ "cellView": "form"
96
+ },
97
+ "execution_count": null,
98
+ "outputs": []
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 3,
103
+ "metadata": {
104
+ "id": "PQroYRRZzQiN",
105
+ "cellView": "form"
106
+ },
107
+ "outputs": [],
108
+ "source": [
109
+ "#@title #Run\n",
110
+ "#@markdown Once the installation is complete, you can use public URL that is displayed.\n",
111
+ "if 'arguments' in locals():\n",
112
+ " !python app.py --share --colab{arguments}\n",
113
+ "else:\n",
114
+ " !python app.py --share --colab"
115
+ ]
116
+ }
117
+ ],
118
+ "metadata": {
119
+ "colab": {
120
+ "provenance": [],
121
+ "gpuType": "T4"
122
+ },
123
+ "kernelspec": {
124
+ "display_name": "Python 3",
125
+ "name": "python3"
126
+ },
127
+ "language_info": {
128
+ "name": "python"
129
+ },
130
+ "accelerator": "GPU"
131
+ },
132
+ "nbformat": 4,
133
+ "nbformat_minor": 0
134
+ }
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Remove the --extra-index-url line below if you're not using Nvidia GPU.
2
+ # If you're using it, update url to your CUDA version (CUDA 12.1 is minimum requirement):
3
+ # For CUDA 12.1, use : https://download.pytorch.org/whl/cu121
4
+ # For CUDA 12.4, use : https://download.pytorch.org/whl/cu124
5
+ --extra-index-url https://download.pytorch.org/whl/cu124
6
+
7
+ gradio==4.44.1
8
+ torch
9
+ torchaudio
10
+ git+https://github.com/jhj0517/jhj0517-whisper.git
11
+ faster-whisper==1.0.3
12
+ transformers
13
+ gradio-i18n
14
+ pytubefix
15
+ ruamel.yaml==0.18.6
16
+ pyannote.audio==3.3.1
17
+ git+https://github.com/jhj0517/ultimatevocalremover_api.git
18
+ git+https://github.com/jhj0517/pyrubberband.git
start-webui.bat ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
@echo off

call venv\scripts\activate

rem Announce the launch BEFORE starting the app; the original echoed it
rem only after python exited, which was misleading.
echo "launching the app"
python app.py %*

pause
start-webui.sh ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/bin/bash

source venv/bin/activate

# Announce the launch BEFORE starting the app; the original echoed it
# only after python exited, which was misleading.
echo "launching the app"
python app.py "$@"
tests/test_bgm_separation.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import torch
10
+ import os
11
+
12
+
13
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, True, False),
        (WhisperImpl.FASTER_WHISPER.value, False, True, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, True, False)
    ]
)
def test_bgm_separation_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with BGM separation enabled."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
32
+
33
+
34
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, True, True, False),
        (WhisperImpl.FASTER_WHISPER.value, True, True, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, True, False)
    ]
)
def test_bgm_separation_with_vad_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with both VAD and BGM separation on."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
53
+
tests/test_config.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ import jiwer
3
+ import os
4
+ import torch
5
+
6
+ from modules.utils.paths import *
7
+ from modules.utils.youtube_manager import *
8
+
9
+ TEST_FILE_DOWNLOAD_URL = "https://github.com/jhj0517/whisper_flutter_new/raw/main/example/assets/jfk.wav"
10
+ TEST_FILE_PATH = os.path.join(WEBUI_DIR, "tests", "jfk.wav")
11
+ TEST_ANSWER = "And so my fellow Americans ask not what your country can do for you ask what you can do for your country"
12
+ TEST_YOUTUBE_URL = "https://www.youtube.com/watch?v=4WEQtgnBu0I&ab_channel=AndriaFitzer"
13
+ TEST_WHISPER_MODEL = "tiny"
14
+ TEST_UVR_MODEL = "UVR-MDX-NET-Inst_HQ_4"
15
+ TEST_NLLB_MODEL = "facebook/nllb-200-distilled-600M"
16
+ TEST_SUBTITLE_SRT_PATH = os.path.join(WEBUI_DIR, "tests", "test_srt.srt")
17
+ TEST_SUBTITLE_VTT_PATH = os.path.join(WEBUI_DIR, "tests", "test_vtt.vtt")
18
+
19
+
20
@functools.lru_cache
def is_cuda_available():
    """Cached check for CUDA availability (queried once per test session)."""
    return torch.cuda.is_available()
23
+
24
+
25
@functools.lru_cache
def is_pytube_detected_bot(url: str = TEST_YOUTUBE_URL):
    """
    Return True when pytube is blocked ("detected as a bot") for *url*.

    Used by tests to skip the YouTube path when downloading is impossible.
    """
    try:
        yt_temp_path = os.path.join("modules", "yt_tmp.wav")
        if os.path.exists(yt_temp_path):
            # A previously downloaded temp file proves pytube works.
            return False
        yt = get_ytdata(url)
        # Probing the audio stream is what actually triggers the bot check;
        # the returned object itself is not needed.
        get_ytaudio(yt)
        return False
    except Exception as e:
        # Deliberately broad: ANY failure here means we cannot rely on pytube.
        print(f"Pytube has detected as a bot: {e}")
        return True
37
+
38
+
39
def calculate_wer(answer, prediction):
    """Word error rate between the reference *answer* and the *prediction*."""
    return jiwer.wer(answer, prediction)
tests/test_diarization.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import os
10
+
11
+
12
@pytest.mark.skipif(
    not is_cuda_available(),
    reason="Skipping because the test only works on GPU"
)
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, False, True),
        (WhisperImpl.FASTER_WHISPER.value, False, False, True),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, True)
    ]
)
def test_diarization_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the end-to-end transcription test with speaker diarization enabled."""
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )
31
+
tests/test_srt.srt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ 1
2
+ 00:00:00,000 --> 00:00:02,240
3
+ You've got
4
+
5
+ 2
6
+ 00:00:02,240 --> 00:00:04,160
7
+ a friend in me.
tests/test_transcription.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.whisper.whisper_factory import WhisperFactory
2
+ from modules.whisper.data_classes import *
3
+ from modules.utils.subtitle_manager import read_file
4
+ from modules.utils.paths import WEBUI_DIR
5
+ from test_config import *
6
+
7
+ import requests
8
+ import pytest
9
+ import gradio as gr
10
+ import os
11
+
12
+
13
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, False, False, False),
        (WhisperImpl.FASTER_WHISPER.value, False, False, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, False, False, False)
    ]
)
def test_transcribe(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """End-to-end smoke test covering the file, youtube and microphone inputs."""
    tests_dir = os.path.join(WEBUI_DIR, "tests")
    audio_path = os.path.join(tests_dir, "jfk.wav")
    if not os.path.exists(audio_path):
        download_file(TEST_FILE_DOWNLOAD_URL, tests_dir)

    # Diarized output prefixes the text with the speaker label.
    answer = "SPEAKER_00|" + TEST_ANSWER if diarization else TEST_ANSWER

    whisper_inferencer = WhisperFactory.create_whisper_inference(
        whisper_type=whisper_type,
    )
    print(
        f"""Whisper Device : {whisper_inferencer.device}\n"""
        f"""BGM Separation Device: {whisper_inferencer.music_separator.device}\n"""
        f"""Diarization Device: {whisper_inferencer.diarizer.device}"""
    )

    hparams = TranscriptionPipelineParams(
        whisper=WhisperParams(
            model_size=TEST_WHISPER_MODEL,
            compute_type=whisper_inferencer.current_compute_type
        ),
        vad=VadParams(
            vad_filter=vad_filter
        ),
        bgm_separation=BGMSeparationParams(
            is_separate_bgm=bgm_separation,
            enable_offload=True
        ),
        diarization=DiarizationParams(
            is_diarize=diarization
        ),
    ).to_list()

    def _wer_of(subtitle_path: str) -> float:
        # The third line of the SRT holds the first segment's text; strip
        # punctuation before comparing against the reference sentence.
        text_line = read_file(subtitle_path).split("\n")[2]
        return calculate_wer(answer, text_line.strip().replace(",", "").replace(".", ""))

    subtitle_str, file_paths = whisper_inferencer.transcribe_file(
        [audio_path],
        None,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    assert _wer_of(file_paths[0]) < 0.1

    if not is_pytube_detected_bot():
        subtitle_str, file_path = whisper_inferencer.transcribe_youtube(
            TEST_YOUTUBE_URL,
            "SRT",
            False,
            gr.Progress(),
            *hparams,
        )
        assert isinstance(subtitle_str, str) and subtitle_str
        assert os.path.exists(file_path)

    subtitle_str, file_path = whisper_inferencer.transcribe_mic(
        audio_path,
        "SRT",
        False,
        gr.Progress(),
        *hparams,
    )
    assert _wer_of(file_path) < 0.1
93
+
94
+
95
def download_file(url, save_dir):
    """
    Download *url* into *save_dir*, keeping the original file name.

    Skips the download entirely when the shared test fixture
    (TEST_FILE_PATH) already exists.
    """
    if os.path.exists(TEST_FILE_PATH):
        return

    os.makedirs(save_dir, exist_ok=True)

    file_name = url.split("/")[-1]
    file_path = os.path.join(save_dir, file_name)

    # Bound the request and fail loudly on HTTP errors instead of silently
    # writing an error page to disk as if it were the audio fixture.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File downloaded to: {file_path}")
+ print(f"File downloaded to: {file_path}")
tests/test_translation.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.translation.deepl_api import DeepLAPI
2
+ from modules.translation.nllb_inference import NLLBInference
3
+ from test_config import *
4
+
5
+ import os
6
+ import pytest
7
+
8
+
9
@pytest.mark.parametrize("model_size, file_path", [
    (TEST_NLLB_MODEL, TEST_SUBTITLE_SRT_PATH),
    (TEST_NLLB_MODEL, TEST_SUBTITLE_VTT_PATH),
])
def test_nllb_inference(
    model_size: str,
    file_path: str
):
    """Translate an SRT and a VTT subtitle file English -> Korean with NLLB
    and check that a result string and an output file path are produced."""
    translator = NLLBInference()
    print(f"NLLB Device : {translator.device}")

    translated_text, output_paths = translator.translate_file(
        fileobjs=[file_path],
        model_size=model_size,
        src_lang="eng_Latn",
        tgt_lang="kor_Hang",
    )

    # Smoke assertions only: the translation content itself is model-dependent.
    assert isinstance(translated_text, str)
    assert isinstance(output_paths[0], str)
29
+
30
+
31
@pytest.mark.skipif(
    # `not os.getenv(...)` already covers both the unset (None) and
    # empty-string cases, so a single call suffices.
    not os.getenv("DEEPL_API_KEY"),
    reason="DeepL API key is unavailable"
)
@pytest.mark.parametrize("file_path", [
    TEST_SUBTITLE_SRT_PATH,
    TEST_SUBTITLE_VTT_PATH,
])
def test_deepl_api(
    file_path: str
):
    """Translate a subtitle file English -> Korean via the DeepL API and
    check that a result string and an output file path are produced.

    Skipped unless the DEEPL_API_KEY environment variable is set and
    non-empty (the test performs a real network call).
    """
    deepl_api = DeepLAPI()

    api_key = os.getenv("DEEPL_API_KEY")

    result_str, file_paths = deepl_api.translate_deepl(
        auth_key=api_key,
        fileobjs=[file_path],
        source_lang="English",
        target_lang="Korean",
        is_pro=False,
        add_timestamp=True,
    )

    # Smoke assertions only: the translated content depends on the service.
    assert isinstance(result_str, str)
    assert isinstance(file_paths[0], str)
tests/test_vad.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from modules.utils.paths import *
2
+ from modules.whisper.whisper_factory import WhisperFactory
3
+ from modules.whisper.data_classes import *
4
+ from test_config import *
5
+ from test_transcription import download_file, test_transcribe
6
+
7
+ import gradio as gr
8
+ import pytest
9
+ import os
10
+
11
+
12
@pytest.mark.parametrize(
    "whisper_type,vad_filter,bgm_separation,diarization",
    [
        (WhisperImpl.WHISPER.value, True, False, False),
        (WhisperImpl.FASTER_WHISPER.value, True, False, False),
        (WhisperImpl.INSANELY_FAST_WHISPER.value, True, False, False)
    ]
)
def test_vad_pipeline(
    whisper_type: str,
    vad_filter: bool,
    bgm_separation: bool,
    diarization: bool,
):
    """Run the full transcription test with the VAD filter enabled for each
    whisper implementation.

    NOTE(review): importing `test_transcribe` from test_transcription likely
    makes pytest collect that test twice (once per module) — confirm whether
    that duplication is intended.
    """
    test_transcribe(
        whisper_type=whisper_type,
        vad_filter=vad_filter,
        bgm_separation=bgm_separation,
        diarization=diarization,
    )