Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Jun 26, 2024

Commit

201b316

1 Parent(s): 595b5f3

add `--diarization_model_dir` cli arg

Browse files

Files changed (8) hide show

app.py +36 -16
modules/diarize_pipeline.py +1 -1
modules/diarizer.py +1 -1
modules/faster_whisper_inference.py +5 -2
modules/insanely_fast_whisper_inference.py +5 -2
modules/whisper_Inference.py +5 -2
modules/whisper_base.py +6 -2
modules/whisper_parameter.py +6 -0

app.py CHANGED Viewed

@@ -36,23 +36,27 @@ class App:
         if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
             whisper_inf = FasterWhisperInference(
                 model_dir=self.args.faster_whisper_model_dir,
-                output_dir=self.args.output_dir
             )
         elif whisper_type in ["whisper"]:
             whisper_inf = WhisperInference(
                 model_dir=self.args.whisper_model_dir,
-                output_dir=self.args.output_dir
             )
         elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
                               "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
             whisper_inf = InsanelyFastWhisperInference(
                 model_dir=self.args.insanely_fast_whisper_model_dir,
-                output_dir=self.args.output_dir
             )
         else:
             whisper_inf = FasterWhisperInference(
                 model_dir=self.args.faster_whisper_model_dir,
-                output_dir=self.args.output_dir
             )
         return whisper_inf
@@ -90,7 +94,7 @@ class App:
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -101,7 +105,7 @@ class App:
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                         nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
-                    with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -109,12 +113,14 @@ class App:
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                         nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
-                    with gr.Accordion("Diarization Parameters", open=False):
-                        cb_diarize = gr.Checkbox(label="Enable Diarization")
-                        tb_hf_token = gr.Text(label="HuggingFace Token", value="")
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
@@ -146,7 +152,8 @@ class App:
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
-                                                       hf_token=tb_hf_token)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + whisper_params.as_list(),
@@ -174,7 +181,7 @@ class App:
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -185,7 +192,7 @@ class App:
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                         nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
-                    with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -193,6 +200,11 @@ class App:
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
                                       visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -228,7 +240,8 @@ class App:
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
-                                                       hf_token=tb_hf_token)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + whisper_params.as_list(),
@@ -249,7 +262,7 @@ class App:
                         dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
-                    with gr.Accordion("Advanced_Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
@@ -259,7 +272,7 @@ class App:
                         cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
-                    with gr.Accordion("VAD Options", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
@@ -267,6 +280,11 @@ class App:
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
                                       visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
@@ -302,7 +320,8 @@ class App:
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
-                                                       hf_token=tb_hf_token)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),
@@ -404,6 +423,7 @@ parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=Tru
 parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
 parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
 parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
 parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
 _args = parser.parse_args()

         if whisper_type in ["faster_whisper", "faster-whisper", "fasterwhisper"]:
             whisper_inf = FasterWhisperInference(
                 model_dir=self.args.faster_whisper_model_dir,
+                output_dir=self.args.output_dir,
+                args=self.args
             )
         elif whisper_type in ["whisper"]:
             whisper_inf = WhisperInference(
                 model_dir=self.args.whisper_model_dir,
+                output_dir=self.args.output_dir,
+                args=self.args
             )
         elif whisper_type in ["insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
                               "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"]:
             whisper_inf = InsanelyFastWhisperInference(
                 model_dir=self.args.insanely_fast_whisper_model_dir,
+                output_dir=self.args.output_dir,
+                args=self.args
             )
         else:
             whisper_inf = FasterWhisperInference(
                 model_dir=self.args.faster_whisper_model_dir,
+                output_dir=self.args.output_dir,
+                args=self.args
             )
         return whisper_inf
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
+                    with gr.Accordion("Advanced Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                         nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+                    with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Diarization", open=False):
+                        cb_diarize = gr.Checkbox(label="Enable Diarization")
+                        tb_hf_token = gr.Text(label="HuggingFace Token", value="",
+                                              info="This is only needed the first time you download the model. If you already have models, you don't need to enter.")
+                        dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False, visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                         nb_batch_size = gr.Number(label="Batch Size", value=24, precision=0)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
+                                                       hf_token=tb_hf_token,
+                                                       diarization_device=dd_diarization_device)
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + whisper_params.as_list(),
                     with gr.Row():
                         cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
                                                    interactive=True)
+                    with gr.Accordion("Advanced Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
                         nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=2.4, interactive=True)
+                    with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Diarization", open=False):
+                        cb_diarize = gr.Checkbox(label="Enable Diarization")
+                        tb_hf_token = gr.Text(label="HuggingFace Token", value="",
+                                              info="This is only needed the first time you download the model. If you already have models, you don't need to enter.")
+                        dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
                                       visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
+                                                       hf_token=tb_hf_token,
+                                                       diarization_device=dd_diarization_device)
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + whisper_params.as_list(),
                         dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
                     with gr.Row():
                         cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
+                    with gr.Accordion("Advanced Parameters", open=False):
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                         cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
                         tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
                         sd_temperature = gr.Slider(label="Temperature", value=0, step=0.01, maximum=1.0, interactive=True)
+                    with gr.Accordion("VAD", open=False, visible=isinstance(self.whisper_inf, FasterWhisperInference)):
                         cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
                         sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
                         nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
                         nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
                         nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
                         nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
+                    with gr.Accordion("Diarization", open=False):
+                        cb_diarize = gr.Checkbox(label="Enable Diarization")
+                        tb_hf_token = gr.Text(label="HuggingFace Token", value="",
+                                              info="This is only needed the first time you download the model. If you already have models, you don't need to enter.")
+                        dd_diarization_device = gr.Dropdown(label="Device", choices=self.whisper_inf.diarizer.get_available_device(), value=self.whisper_inf.diarizer.get_device())
                     with gr.Accordion("Insanely Fast Whisper Parameters", open=False,
                                       visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
                         nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=30, precision=0)
                                                        chunk_length_s=nb_chunk_length_s,
                                                        batch_size=nb_batch_size,
                                                        is_diarize=cb_diarize,
+                                                       hf_token=tb_hf_token,
+                                                       diarization_device=dd_diarization_device)
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),
 parser.add_argument('--whisper_model_dir', type=str, default=os.path.join("models", "Whisper"), help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "faster-whisper"), help='Directory path of the faster-whisper model')
 parser.add_argument('--insanely_fast_whisper_model_dir', type=str, default=os.path.join("models", "Whisper", "insanely-fast-whisper"), help='Directory path of the insanely-fast-whisper model')
+parser.add_argument('--diarization_model_dir', type=str, default=os.path.join("models", "Diarization"), help='Directory path of the diarization model')
 parser.add_argument('--nllb_model_dir', type=str, default=os.path.join("models", "NLLB"), help='Directory path of the Facebook NLLB model')
 parser.add_argument('--output_dir', type=str, default=os.path.join("outputs"), help='Directory path of the outputs')
 _args = parser.parse_args()

modules/diarize_pipeline.py CHANGED Viewed

@@ -11,7 +11,7 @@ class DiarizationPipeline:
     def __init__(
         self,
         model_name="pyannote/speaker-diarization-3.1",
-        cache_dir: str = os.path.join("models", "Whisper", "whisperx"),
         use_auth_token=None,
         device: Optional[Union[str, torch.device]] = "cpu",
     ):

     def __init__(
         self,
         model_name="pyannote/speaker-diarization-3.1",
+        cache_dir: str = os.path.join("models", "Diarization"),
         use_auth_token=None,
         device: Optional[Union[str, torch.device]] = "cpu",
     ):

modules/diarizer.py CHANGED Viewed

@@ -9,7 +9,7 @@ from modules.diarize_pipeline import DiarizationPipeline
 class Diarizer:
     def __init__(self,
-                 model_dir: str = os.path.join("models", "Whisper", "whisperx")
                  ):
         self.device = self.get_device()
         self.available_device = self.get_available_device()

 class Diarizer:
     def __init__(self,
+                 model_dir: str = os.path.join("models", "Diarization")
                  ):
         self.device = self.get_device()
         self.available_device = self.get_available_device()

modules/faster_whisper_inference.py CHANGED Viewed

@@ -7,6 +7,7 @@ from faster_whisper.vad import VadOptions
 import ctranslate2
 import whisper
 import gradio as gr
 from modules.whisper_parameter import *
 from modules.whisper_base import WhisperBase
@@ -15,11 +16,13 @@ from modules.whisper_base import WhisperBase
 class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
-                 output_dir: str
                  ):
         super().__init__(
             model_dir=model_dir,
-            output_dir=output_dir
         )
         self.model_paths = self.get_model_paths()
         self.available_models = self.model_paths.keys()

 import ctranslate2
 import whisper
 import gradio as gr
+from argparse import Namespace
 from modules.whisper_parameter import *
 from modules.whisper_base import WhisperBase
 class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
+                 output_dir: str,
+                 args: Namespace
                  ):
         super().__init__(
             model_dir=model_dir,
+            output_dir=output_dir,
+            args=args
         )
         self.model_paths = self.get_model_paths()
         self.available_models = self.model_paths.keys()

modules/insanely_fast_whisper_inference.py CHANGED Viewed

@@ -9,6 +9,7 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from modules.whisper_parameter import *
 from modules.whisper_base import WhisperBase
@@ -17,11 +18,13 @@ from modules.whisper_base import WhisperBase
 class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
-                 output_dir: str
                  ):
         super().__init__(
             model_dir=model_dir,
-            output_dir=output_dir
         )
         openai_models = whisper.available_models()
         distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]

 from huggingface_hub import hf_hub_download
 import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
+from argparse import Namespace
 from modules.whisper_parameter import *
 from modules.whisper_base import WhisperBase
 class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
+                 output_dir: str,
+                 args: Namespace
                  ):
         super().__init__(
             model_dir=model_dir,
+            output_dir=output_dir,
+            args=args
         )
         openai_models = whisper.available_models()
         distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]

modules/whisper_Inference.py CHANGED Viewed

@@ -5,6 +5,7 @@ import os
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 import torch
 from modules.whisper_base import WhisperBase
 from modules.whisper_parameter import *
@@ -13,11 +14,13 @@ from modules.whisper_parameter import *
 class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
-                 output_dir: str
                  ):
         super().__init__(
             model_dir=model_dir,
-            output_dir=output_dir
         )
     def transcribe(self,

 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 import torch
+from argparse import Namespace
 from modules.whisper_base import WhisperBase
 from modules.whisper_parameter import *
 class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str,
+                 output_dir: str,
+                 args: Namespace
                  ):
         super().__init__(
             model_dir=model_dir,
+            output_dir=output_dir,
+            args=args
         )
     def transcribe(self,

modules/whisper_base.py CHANGED Viewed

@@ -7,6 +7,7 @@ from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from datetime import datetime
 import time
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
@@ -18,7 +19,8 @@ from modules.diarizer import Diarizer
 class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str,
-                 output_dir: str
                  ):
         self.model = None
         self.current_model_size = None
@@ -32,7 +34,9 @@ class WhisperBase(ABC):
         self.device = self.get_device()
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
-        self.diarizer = Diarizer()
     @abstractmethod
     def transcribe(self,

 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from datetime import datetime
+from argparse import Namespace
 import time
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str,
+                 output_dir: str,
+                 args: Namespace
                  ):
         self.model = None
         self.current_model_size = None
         self.device = self.get_device()
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
+        self.diarizer = Diarizer(
+            model_dir=args.diarization_model_dir
+        )
     @abstractmethod
     def transcribe(self,

modules/whisper_parameter.py CHANGED Viewed

@@ -29,6 +29,7 @@ class WhisperParameters:
     batch_size: gr.Number
     is_diarize: gr.Checkbox
     hf_token: gr.Textbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -131,6 +132,9 @@ class WhisperParameters:
     hf_token: gr.Textbox
         This parameter is related with whisperx. Huggingface token is needed to download diarization models.
         Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
     """
     def as_list(self) -> list:
@@ -180,6 +184,7 @@ class WhisperParameters:
             batch_size=args[21],
             is_diarize=args[22],
             hf_token=args[23],
         )
@@ -209,6 +214,7 @@ class WhisperValues:
     batch_size: int
     is_diarize: bool
     hf_token: str
     """
     A data class to use Whisper parameters.
     """

     batch_size: gr.Number
     is_diarize: gr.Checkbox
     hf_token: gr.Textbox
+    diarization_device: gr.Dropdown
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
     hf_token: gr.Textbox
         This parameter is related with whisperx. Huggingface token is needed to download diarization models.
         Read more about : https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
+    diarization_device: gr.Dropdown
+        This parameter is related with whisperx. Device to run diarization model
     """
     def as_list(self) -> list:
             batch_size=args[21],
             is_diarize=args[22],
             hf_token=args[23],
+            diarization_device=args[24]
         )
     batch_size: int
     is_diarize: bool
     hf_token: str
+    diarization_device: str
     """
     A data class to use Whisper parameters.
     """