Spaces:

leekwoon
/

Whisper-FastAPI

Sleeping

App Files Files Community

dahyedahye committed on Sep 2, 2024

Commit

3e210b5

1 Parent(s): 02e32e0

Add application file

Browse files

Files changed (40) hide show

Dockerfile +3 -1
main.py → app.py +0 -0
models/models will be saved here.txt +0 -0
modules/__init__.py +0 -0
modules/__pycache__/__init__.cpython-310.pyc +0 -0
modules/diarize/__init__.py +0 -0
modules/diarize/__pycache__/__init__.cpython-310.pyc +0 -0
modules/diarize/__pycache__/diarize_pipeline.cpython-310.pyc +0 -0
modules/diarize/__pycache__/diarizer.cpython-310.pyc +0 -0
modules/diarize/audio_loader.py +0 -179
modules/diarize/diarize_pipeline.py +0 -94
modules/diarize/diarizer.py +0 -132
modules/translation/__init__.py +0 -0
modules/translation/deepl_api.py +0 -201
modules/translation/nllb_inference.py +0 -276
modules/translation/translation_base.py +0 -151
modules/utils/__init__.py +0 -0
modules/utils/__pycache__/__init__.cpython-310.pyc +0 -0
modules/utils/__pycache__/files_manager.cpython-310.pyc +0 -0
modules/utils/__pycache__/subtitle_manager.cpython-310.pyc +0 -0
modules/utils/__pycache__/youtube_manager.cpython-310.pyc +0 -0
modules/utils/files_manager.py +0 -39
modules/utils/subtitle_manager.py +0 -135
modules/utils/youtube_manager.py +0 -15
modules/vad/__init__.py +0 -0
modules/vad/silero_vad.py +0 -264
modules/whisper/__init__.py +0 -0
modules/whisper/__pycache__/__init__.cpython-310.pyc +0 -0
modules/whisper/__pycache__/faster_whisper_inference.cpython-310.pyc +0 -0
modules/whisper/__pycache__/whisper_base.cpython-310.pyc +0 -0
modules/whisper/__pycache__/whisper_factory.cpython-310.pyc +0 -0
modules/whisper/__pycache__/whisper_parameter.cpython-310.pyc +0 -0
modules/whisper/faster_whisper_inference.py +0 -191
modules/whisper/insanely_fast_whisper_inference.py +0 -185
modules/whisper/whisper_Inference.py +0 -101
modules/whisper/whisper_base.py +0 -436
modules/whisper/whisper_factory.py +0 -81
modules/whisper/whisper_parameter.py +0 -277
outputs/outputs are saved here.txt +0 -0
outputs/translations/outputs for translation are saved here.txt +0 -0

Dockerfile CHANGED Viewed

@@ -25,11 +25,13 @@ WORKDIR /Whisper-WebUI
 COPY . .
 COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
 VOLUME [ "/Whisper-WebUI/models" ]
 VOLUME [ "/Whisper-WebUI/outputs" ]
 ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
 ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
-COPY --chown=user . /app
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

 COPY . .
 COPY --from=builder /Whisper-WebUI/venv /Whisper-WebUI/venv
+# Set permissions
+RUN chown -R 1000:1000 /Whisper-WebUI/models /Whisper-WebUI/outputs
 VOLUME [ "/Whisper-WebUI/models" ]
 VOLUME [ "/Whisper-WebUI/outputs" ]
 ENV PATH="/Whisper-WebUI/venv/bin:$PATH"
 ENV LD_LIBRARY_PATH=/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cublas/lib:/Whisper-WebUI/venv/lib64/python3.11/site-packages/nvidia/cudnn/lib
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py → app.py RENAMED Viewed

File without changes

models/models will be saved here.txt DELETED Viewed

File without changes

modules/__init__.py DELETED Viewed

File without changes

modules/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (167 Bytes)

modules/diarize/__init__.py DELETED Viewed

File without changes

modules/diarize/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (175 Bytes)

modules/diarize/__pycache__/diarize_pipeline.cpython-310.pyc DELETED Viewed

Binary file (3.06 kB)

modules/diarize/__pycache__/diarizer.cpython-310.pyc DELETED Viewed

Binary file (4.14 kB)

modules/diarize/audio_loader.py DELETED Viewed

@@ -1,179 +0,0 @@
-# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/audio.py
-import os
-import subprocess
-from functools import lru_cache
-from typing import Optional, Union
-from scipy.io.wavfile import write
-import tempfile
-import numpy as np
-import torch
-import torch.nn.functional as F
-def exact_div(x, y):
-    assert x % y == 0
-    return x // y
-# hard-coded audio hyperparameters
-SAMPLE_RATE = 16000
-N_FFT = 400
-HOP_LENGTH = 160
-CHUNK_LENGTH = 30
-N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
-N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input
-N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
-FRAMES_PER_SECOND = exact_div(SAMPLE_RATE, HOP_LENGTH)  # 10ms per audio frame
-TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
-def load_audio(file: Union[str, np.ndarray], sr: int = SAMPLE_RATE) -> np.ndarray:
-    """
-    Open an audio file or process a numpy array containing audio data as mono waveform, resampling as necessary.
-    Parameters
-    ----------
-    file: Union[str, np.ndarray]
-        The audio file to open or a numpy array containing the audio data.
-    sr: int
-        The sample rate to resample the audio if necessary.
-    Returns
-    -------
-    A NumPy array containing the audio waveform, in float32 dtype.
-    """
-    if isinstance(file, np.ndarray):
-        if file.dtype != np.float32:
-            file = file.astype(np.float32)
-        if file.ndim > 1:
-            file = np.mean(file, axis=1)
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        write(temp_file.name, SAMPLE_RATE, (file * 32768).astype(np.int16))
-        temp_file_path = temp_file.name
-        temp_file.close()
-    else:
-        temp_file_path = file
-    try:
-        cmd = [
-            "ffmpeg",
-            "-nostdin",
-            "-threads",
-            "0",
-            "-i",
-            temp_file_path,
-            "-f",
-            "s16le",
-            "-ac",
-            "1",
-            "-acodec",
-            "pcm_s16le",
-            "-ar",
-            str(sr),
-            "-",
-        ]
-        out = subprocess.run(cmd, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-    finally:
-        if isinstance(file, np.ndarray):
-            os.remove(temp_file_path)
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
-def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
-    """
-    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
-    """
-    if torch.is_tensor(array):
-        if array.shape[axis] > length:
-            array = array.index_select(
-                dim=axis, index=torch.arange(length, device=array.device)
-            )
-        if array.shape[axis] < length:
-            pad_widths = [(0, 0)] * array.ndim
-            pad_widths[axis] = (0, length - array.shape[axis])
-            array = F.pad(array, [pad for sizes in pad_widths[::-1] for pad in sizes])
-    else:
-        if array.shape[axis] > length:
-            array = array.take(indices=range(length), axis=axis)
-        if array.shape[axis] < length:
-            pad_widths = [(0, 0)] * array.ndim
-            pad_widths[axis] = (0, length - array.shape[axis])
-            array = np.pad(array, pad_widths)
-    return array
-@lru_cache(maxsize=None)
-def mel_filters(device, n_mels: int) -> torch.Tensor:
-    """
-    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
-    Allows decoupling librosa dependency; saved using:
-        np.savez_compressed(
-            "mel_filters.npz",
-            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
-        )
-    """
-    assert n_mels in [80, 128], f"Unsupported n_mels: {n_mels}"
-    with np.load(
-        os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz")
-    ) as f:
-        return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
-def log_mel_spectrogram(
-    audio: Union[str, np.ndarray, torch.Tensor],
-    n_mels: int,
-    padding: int = 0,
-    device: Optional[Union[str, torch.device]] = None,
-):
-    """
-    Compute the log-Mel spectrogram of the given audio.
-    Parameters
-    ----------
-    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
-        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
-    n_mels: int
-        The number of Mel-frequency filters; 80 and 128 are supported
-    padding: int
-        Number of zero samples to pad to the right
-    device: Optional[Union[str, torch.device]]
-        If given, the audio tensor is moved to this device before STFT
-    Returns
-    -------
-    torch.Tensor, shape = (n_mels, n_frames)
-        A Tensor that contains the Mel spectrogram
-    """
-    if not torch.is_tensor(audio):
-        if isinstance(audio, str):
-            audio = load_audio(audio)
-        audio = torch.from_numpy(audio)
-    if device is not None:
-        audio = audio.to(device)
-    if padding > 0:
-        audio = F.pad(audio, (0, padding))
-    window = torch.hann_window(N_FFT).to(audio.device)
-    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
-    magnitudes = stft[..., :-1].abs() ** 2
-    filters = mel_filters(audio.device, n_mels)
-    mel_spec = filters @ magnitudes
-    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
-    log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
-    log_spec = (log_spec + 4.0) / 4.0
-    return log_spec

modules/diarize/diarize_pipeline.py DELETED Viewed

@@ -1,94 +0,0 @@
-# Adapted from https://github.com/m-bain/whisperX/blob/main/whisperx/diarize.py
-import numpy as np
-import pandas as pd
-import os
-from pyannote.audio import Pipeline
-from typing import Optional, Union
-import torch
-from modules.diarize.audio_loader import load_audio, SAMPLE_RATE
-class DiarizationPipeline:
-    def __init__(
-        self,
-        model_name="pyannote/speaker-diarization-3.1",
-        cache_dir: str = os.path.join("models", "Diarization"),
-        use_auth_token=None,
-        device: Optional[Union[str, torch.device]] = "cpu",
-    ):
-        if isinstance(device, str):
-            device = torch.device(device)
-        self.model = Pipeline.from_pretrained(
-            model_name,
-            use_auth_token=use_auth_token,
-            cache_dir=cache_dir
-        ).to(device)
-    def __call__(self, audio: Union[str, np.ndarray], min_speakers=None, max_speakers=None):
-        if isinstance(audio, str):
-            audio = load_audio(audio)
-        audio_data = {
-            'waveform': torch.from_numpy(audio[None, :]),
-            'sample_rate': SAMPLE_RATE
-        }
-        segments = self.model(audio_data, min_speakers=min_speakers, max_speakers=max_speakers)
-        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
-        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
-        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
-        return diarize_df
-def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
-    transcript_segments = transcript_result["segments"]
-    for seg in transcript_segments:
-        # assign speaker to segment (if any)
-        diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
-                                                                                            seg['start'])
-        diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
-        intersected = diarize_df[diarize_df["intersection"] > 0]
-        speaker = None
-        if len(intersected) > 0:
-            # Choosing the strongest intersection
-            speaker = intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
-        elif fill_nearest:
-            # Otherwise choosing closest
-            speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
-        if speaker is not None:
-            seg["speaker"] = speaker
-        # assign speaker to words
-        if 'words' in seg:
-            for word in seg['words']:
-                if 'start' in word:
-                    diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(
-                        diarize_df['start'], word['start'])
-                    diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'],
-                                                                                                  word['start'])
-                    intersected = diarize_df[diarize_df["intersection"] > 0]
-                    word_speaker = None
-                    if len(intersected) > 0:
-                        # Choosing the strongest intersection
-                        word_speaker = \
-                            intersected.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
-                    elif fill_nearest:
-                        # Otherwise choosing closest
-                        word_speaker = diarize_df.sort_values(by=["intersection"], ascending=False)["speaker"].values[0]
-                    if word_speaker is not None:
-                        word["speaker"] = word_speaker
-    return transcript_result
-class Segment:
-    def __init__(self, start, end, speaker=None):
-        self.start = start
-        self.end = end
-        self.speaker = speaker

modules/diarize/diarizer.py DELETED Viewed

@@ -1,132 +0,0 @@
-import os
-import torch
-from typing import List, Union, BinaryIO, Optional
-import numpy as np
-import time
-import logging
-from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
-from modules.diarize.audio_loader import load_audio
-class Diarizer:
-    def __init__(self,
-                 model_dir: str = os.path.join("models", "Diarization")
-                 ):
-        self.device = self.get_device()
-        self.available_device = self.get_available_device()
-        self.compute_type = "float16"
-        self.model_dir = model_dir
-        os.makedirs(self.model_dir, exist_ok=True)
-        self.pipe = None
-    def run(self,
-            audio: Union[str, BinaryIO, np.ndarray],
-            transcribed_result: List[dict],
-            use_auth_token: str,
-            device: Optional[str] = None
-            ):
-        """
-        Diarize the transcribed result as a post-processing step.
-        Parameters
-        ----------
-        audio: Union[str, BinaryIO, np.ndarray]
-            Audio input. This can be file path or binary type.
-        transcribed_result: List[dict]
-            transcribed result through whisper.
-        use_auth_token: str
-            Huggingface token with READ permission. This is only needed the first time you download the model.
-            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
-        device: Optional[str]
-            Device for diarization.
-        Returns
-        ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
-        elapsed_time: float
-            elapsed time for running
-        """
-        start_time = time.time()
-        if device is None:
-            device = self.device
-        if device != self.device or self.pipe is None:
-            self.update_pipe(
-                device=device,
-                use_auth_token=use_auth_token
-            )
-        audio = load_audio(audio)
-        diarization_segments = self.pipe(audio)
-        diarized_result = assign_word_speakers(
-            diarization_segments,
-            {"segments": transcribed_result}
-        )
-        for segment in diarized_result["segments"]:
-            speaker = "None"
-            if "speaker" in segment:
-                speaker = segment["speaker"]
-            segment["text"] = speaker + "|" + segment["text"].strip()
-        elapsed_time = time.time() - start_time
-        return diarized_result["segments"], elapsed_time
-    def update_pipe(self,
-                    use_auth_token: str,
-                    device: str
-                    ):
-        """
-        Set pipeline for diarization
-        Parameters
-        ----------
-        use_auth_token: str
-            Huggingface token with READ permission. This is only needed the first time you download the model.
-            You must manually go to the website https://huggingface.co/pyannote/speaker-diarization-3.1 and agree to their TOS to download the model.
-        device: str
-            Device for diarization.
-        """
-        self.device = device
-        os.makedirs(self.model_dir, exist_ok=True)
-        if (not os.listdir(self.model_dir) and
-                not use_auth_token):
-            print(
-                "\nFailed to diarize. You need huggingface token and agree to their requirements to download the diarization model.\n"
-                "Go to \"https://huggingface.co/pyannote/speaker-diarization-3.1\" and follow their instructions to download the model.\n"
-            )
-            return
-        logger = logging.getLogger("speechbrain.utils.train_logger")
-        # Disable redundant torchvision warning message
-        logger.disabled = True
-        self.pipe = DiarizationPipeline(
-            use_auth_token=use_auth_token,
-            device=device,
-            cache_dir=self.model_dir
-        )
-        logger.disabled = False
-    @staticmethod
-    def get_device():
-        if torch.cuda.is_available():
-            return "cuda"
-        elif torch.backends.mps.is_available():
-            return "mps"
-        else:
-            return "cpu"
-    @staticmethod
-    def get_available_device():
-        devices = ["cpu"]
-        if torch.cuda.is_available():
-            devices.append("cuda")
-        elif torch.backends.mps.is_available():
-            devices.append("mps")
-        return devices

modules/translation/__init__.py DELETED Viewed

File without changes

modules/translation/deepl_api.py DELETED Viewed

@@ -1,201 +0,0 @@
-import requests
-import time
-import os
-from datetime import datetime
-import gradio as gr
-from modules.utils.subtitle_manager import *
-"""
-This is written with reference to the DeepL API documentation.
-For details about the DeepL API, see: https://www.deepl.com/docs-api/documents
-"""
-DEEPL_AVAILABLE_TARGET_LANGS = {
-    'Bulgarian': 'BG',
-    'Czech': 'CS',
-    'Danish': 'DA',
-    'German': 'DE',
-    'Greek': 'EL',
-    'English': 'EN',
-    'English (British)': 'EN-GB',
-    'English (American)': 'EN-US',
-    'Spanish': 'ES',
-    'Estonian': 'ET',
-    'Finnish': 'FI',
-    'French': 'FR',
-    'Hungarian': 'HU',
-    'Indonesian': 'ID',
-    'Italian': 'IT',
-    'Japanese': 'JA',
-    'Korean': 'KO',
-    'Lithuanian': 'LT',
-    'Latvian': 'LV',
-    'Norwegian (Bokmål)': 'NB',
-    'Dutch': 'NL',
-    'Polish': 'PL',
-    'Portuguese': 'PT',
-    'Portuguese (Brazilian)': 'PT-BR',
-    'Portuguese (all Portuguese varieties excluding Brazilian Portuguese)': 'PT-PT',
-    'Romanian': 'RO',
-    'Russian': 'RU',
-    'Slovak': 'SK',
-    'Slovenian': 'SL',
-    'Swedish': 'SV',
-    'Turkish': 'TR',
-    'Ukrainian': 'UK',
-    'Chinese (simplified)': 'ZH'
-}
-DEEPL_AVAILABLE_SOURCE_LANGS = {
-    'Automatic Detection': None,
-    'Bulgarian': 'BG',
-    'Czech': 'CS',
-    'Danish': 'DA',
-    'German': 'DE',
-    'Greek': 'EL',
-    'English': 'EN',
-    'Spanish': 'ES',
-    'Estonian': 'ET',
-    'Finnish': 'FI',
-    'French': 'FR',
-    'Hungarian': 'HU',
-    'Indonesian': 'ID',
-    'Italian': 'IT',
-    'Japanese': 'JA',
-    'Korean': 'KO',
-    'Lithuanian': 'LT',
-    'Latvian': 'LV',
-    'Norwegian (Bokmål)': 'NB',
-    'Dutch': 'NL',
-    'Polish': 'PL',
-    'Portuguese (all Portuguese varieties mixed)': 'PT',
-    'Romanian': 'RO',
-    'Russian': 'RU',
-    'Slovak': 'SK',
-    'Slovenian': 'SL',
-    'Swedish': 'SV',
-    'Turkish': 'TR',
-    'Ukrainian': 'UK',
-    'Chinese': 'ZH'
-}
-class DeepLAPI:
-    def __init__(self,
-                 output_dir: str = os.path.join("outputs", "translations")
-                 ):
-        self.api_interval = 1
-        self.max_text_batch_size = 50
-        self.available_target_langs = DEEPL_AVAILABLE_TARGET_LANGS
-        self.available_source_langs = DEEPL_AVAILABLE_SOURCE_LANGS
-        self.output_dir = output_dir
-    def translate_deepl(self,
-                        auth_key: str,
-                        fileobjs: list,
-                        source_lang: str,
-                        target_lang: str,
-                        is_pro: bool,
-                        add_timestamp: bool,
-                        progress=gr.Progress()) -> list:
-        """
-        Translate subtitle files using DeepL API
-        Parameters
-        ----------
-        auth_key: str
-            API Key for DeepL from gr.Textbox()
-        fileobjs: list
-            List of files to translate from gr.Files()
-        source_lang: str
-            Source language of the file to translate from gr.Dropdown()
-        target_lang: str
-            Target language to translate the file into from gr.Dropdown()
-        is_pro: bool
-            Boolean value from gr.Checkbox() indicating whether the user is a DeepL Pro user.
-        add_timestamp: bool
-            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        Returns
-        ----------
-        A List of
-        String to return to gr.Textbox()
-        Files to return to gr.Files()
-        """
-        files_info = {}
-        for fileobj in fileobjs:
-            file_path = fileobj.name
-            file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
-            if file_ext == ".srt":
-                parsed_dicts = parse_srt(file_path=file_path)
-                batch_size = self.max_text_batch_size
-                for batch_start in range(0, len(parsed_dicts), batch_size):
-                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
-                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
-                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
-                                                                    target_lang, is_pro)
-                    for i, translated_text in enumerate(translated_texts):
-                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
-                    progress(batch_end / len(parsed_dicts), desc="Translating..")
-                subtitle = get_serialized_srt(parsed_dicts)
-            elif file_ext == ".vtt":
-                parsed_dicts = parse_vtt(file_path=file_path)
-                batch_size = self.max_text_batch_size
-                for batch_start in range(0, len(parsed_dicts), batch_size):
-                    batch_end = min(batch_start + batch_size, len(parsed_dicts))
-                    sentences_to_translate = [dic["sentence"] for dic in parsed_dicts[batch_start:batch_end]]
-                    translated_texts = self.request_deepl_translate(auth_key, sentences_to_translate, source_lang,
-                                                                    target_lang, is_pro)
-                    for i, translated_text in enumerate(translated_texts):
-                        parsed_dicts[batch_start + i]["sentence"] = translated_text["text"]
-                    progress(batch_end / len(parsed_dicts), desc="Translating..")
-                subtitle = get_serialized_vtt(parsed_dicts)
-            if add_timestamp:
-                timestamp = datetime.now().strftime("%m%d%H%M%S")
-                file_name += f"-{timestamp}"
-            output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
-            write_file(subtitle, output_path)
-            files_info[file_name] = {"subtitle": subtitle, "path": output_path}
-        total_result = ''
-        for file_name, info in files_info.items():
-            total_result += '------------------------------------\n'
-            total_result += f'{file_name}\n\n'
-            total_result += f'{info["subtitle"]}'
-        gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
-        output_file_paths = [item["path"] for key, item in files_info.items()]
-        return [gr_str, output_file_paths]
-    def request_deepl_translate(self,
-                                auth_key: str,
-                                text: list,
-                                source_lang: str,
-                                target_lang: str,
-                                is_pro: bool):
-        """Request API response to DeepL server"""
-        url = 'https://api.deepl.com/v2/translate' if is_pro else 'https://api-free.deepl.com/v2/translate'
-        headers = {
-            'Authorization': f'DeepL-Auth-Key {auth_key}'
-        }
-        data = {
-            'text': text,
-            'source_lang': DEEPL_AVAILABLE_SOURCE_LANGS[source_lang],
-            'target_lang': DEEPL_AVAILABLE_TARGET_LANGS[target_lang]
-        }
-        response = requests.post(url, headers=headers, data=data).json()
-        time.sleep(self.api_interval)
-        return response["translations"]

modules/translation/nllb_inference.py DELETED Viewed

@@ -1,276 +0,0 @@
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-import gradio as gr
-import os
-from modules.translation.translation_base import TranslationBase
-class NLLBInference(TranslationBase):
-    def __init__(self,
-                 model_dir: str = os.path.join("models", "NLLB"),
-                 output_dir: str = os.path.join("outputs", "translations")
-                 ):
-        super().__init__(
-            model_dir=model_dir,
-            output_dir=output_dir
-        )
-        self.tokenizer = None
-        self.available_models = ["facebook/nllb-200-3.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-distilled-600M"]
-        self.available_source_langs = list(NLLB_AVAILABLE_LANGS.keys())
-        self.available_target_langs = list(NLLB_AVAILABLE_LANGS.keys())
-        self.pipeline = None
-    def translate(self,
-                  text: str,
-                  max_length: int
-                  ):
-        result = self.pipeline(
-            text,
-            max_length=max_length
-        )
-        return result[0]['translation_text']
-    def update_model(self,
-                     model_size: str,
-                     src_lang: str,
-                     tgt_lang: str,
-                     progress: gr.Progress
-                     ):
-        if model_size != self.current_model_size or self.model is None:
-            print("\nInitializing NLLB Model..\n")
-            progress(0, desc="Initializing NLLB Model..")
-            self.current_model_size = model_size
-            local_files_only = self.is_model_exists(self.current_model_size)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_size,
-                                                               cache_dir=self.model_dir,
-                                                               local_files_only=local_files_only)
-            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_size,
-                                                           cache_dir=os.path.join(self.model_dir, "tokenizers"),
-                                                           local_files_only=local_files_only)
-        src_lang = NLLB_AVAILABLE_LANGS[src_lang]
-        tgt_lang = NLLB_AVAILABLE_LANGS[tgt_lang]
-        self.pipeline = pipeline("translation",
-                                 model=self.model,
-                                 tokenizer=self.tokenizer,
-                                 src_lang=src_lang,
-                                 tgt_lang=tgt_lang,
-                                 device=self.device)
-    def is_model_exists(self,
-                        model_size: str):
-        """Check if model exists or not (Only facebook model)"""
-        prefix = "models--facebook--"
-        _id, model_size_name = model_size.split("/")
-        model_dir_name = prefix + model_size_name
-        model_dir_path = os.path.join(self.model_dir, model_dir_name)
-        if os.path.exists(model_dir_path) and os.listdir(model_dir_path):
-            return True
-        return False
-NLLB_AVAILABLE_LANGS = {
-    "Acehnese (Arabic script)": "ace_Arab",
-    "Acehnese (Latin script)": "ace_Latn",
-    "Mesopotamian Arabic": "acm_Arab",
-    "Ta’izzi-Adeni Arabic": "acq_Arab",
-    "Tunisian Arabic": "aeb_Arab",
-    "Afrikaans": "afr_Latn",
-    "South Levantine Arabic": "ajp_Arab",
-    "Akan": "aka_Latn",
-    "Amharic": "amh_Ethi",
-    "North Levantine Arabic": "apc_Arab",
-    "Modern Standard Arabic": "arb_Arab",
-    "Modern Standard Arabic (Romanized)": "arb_Latn",
-    "Najdi Arabic": "ars_Arab",
-    "Moroccan Arabic": "ary_Arab",
-    "Egyptian Arabic": "arz_Arab",
-    "Assamese": "asm_Beng",
-    "Asturian": "ast_Latn",
-    "Awadhi": "awa_Deva",
-    "Central Aymara": "ayr_Latn",
-    "South Azerbaijani": "azb_Arab",
-    "North Azerbaijani": "azj_Latn",
-    "Bashkir": "bak_Cyrl",
-    "Bambara": "bam_Latn",
-    "Balinese": "ban_Latn",
-    "Belarusian": "bel_Cyrl",
-    "Bemba": "bem_Latn",
-    "Bengali": "ben_Beng",
-    "Bhojpuri": "bho_Deva",
-    "Banjar (Arabic script)": "bjn_Arab",
-    "Banjar (Latin script)": "bjn_Latn",
-    "Standard Tibetan": "bod_Tibt",
-    "Bosnian": "bos_Latn",
-    "Buginese": "bug_Latn",
-    "Bulgarian": "bul_Cyrl",
-    "Catalan": "cat_Latn",
-    "Cebuano": "ceb_Latn",
-    "Czech": "ces_Latn",
-    "Chokwe": "cjk_Latn",
-    "Central Kurdish": "ckb_Arab",
-    "Crimean Tatar": "crh_Latn",
-    "Welsh": "cym_Latn",
-    "Danish": "dan_Latn",
-    "German": "deu_Latn",
-    "Southwestern Dinka": "dik_Latn",
-    "Dyula": "dyu_Latn",
-    "Dzongkha": "dzo_Tibt",
-    "Greek": "ell_Grek",
-    "English": "eng_Latn",
-    "Esperanto": "epo_Latn",
-    "Estonian": "est_Latn",
-    "Basque": "eus_Latn",
-    "Ewe": "ewe_Latn",
-    "Faroese": "fao_Latn",
-    "Fijian": "fij_Latn",
-    "Finnish": "fin_Latn",
-    "Fon": "fon_Latn",
-    "French": "fra_Latn",
-    "Friulian": "fur_Latn",
-    "Nigerian Fulfulde": "fuv_Latn",
-    "Scottish Gaelic": "gla_Latn",
-    "Irish": "gle_Latn",
-    "Galician": "glg_Latn",
-    "Guarani": "grn_Latn",
-    "Gujarati": "guj_Gujr",
-    "Haitian Creole": "hat_Latn",
-    "Hausa": "hau_Latn",
-    "Hebrew": "heb_Hebr",
-    "Hindi": "hin_Deva",
-    "Chhattisgarhi": "hne_Deva",
-    "Croatian": "hrv_Latn",
-    "Hungarian": "hun_Latn",
-    "Armenian": "hye_Armn",
-    "Igbo": "ibo_Latn",
-    "Ilocano": "ilo_Latn",
-    "Indonesian": "ind_Latn",
-    "Icelandic": "isl_Latn",
-    "Italian": "ita_Latn",
-    "Javanese": "jav_Latn",
-    "Japanese": "jpn_Jpan",
-    "Kabyle": "kab_Latn",
-    "Jingpho": "kac_Latn",
-    "Kamba": "kam_Latn",
-    "Kannada": "kan_Knda",
-    "Kashmiri (Arabic script)": "kas_Arab",
-    "Kashmiri (Devanagari script)": "kas_Deva",
-    "Georgian": "kat_Geor",
-    "Central Kanuri (Arabic script)": "knc_Arab",
-    "Central Kanuri (Latin script)": "knc_Latn",
-    "Kazakh": "kaz_Cyrl",
-    "Kabiyè": "kbp_Latn",
-    "Kabuverdianu": "kea_Latn",
-    "Khmer": "khm_Khmr",
-    "Kikuyu": "kik_Latn",
-    "Kinyarwanda": "kin_Latn",
-    "Kyrgyz": "kir_Cyrl",
-    "Kimbundu": "kmb_Latn",
-    "Northern Kurdish": "kmr_Latn",
-    "Kikongo": "kon_Latn",
-    "Korean": "kor_Hang",
-    "Lao": "lao_Laoo",
-    "Ligurian": "lij_Latn",
-    "Limburgish": "lim_Latn",
-    "Lingala": "lin_Latn",
-    "Lithuanian": "lit_Latn",
-    "Lombard": "lmo_Latn",
-    "Latgalian": "ltg_Latn",
-    "Luxembourgish": "ltz_Latn",
-    "Luba-Kasai": "lua_Latn",
-    "Ganda": "lug_Latn",
-    "Luo": "luo_Latn",
-    "Mizo": "lus_Latn",
-    "Standard Latvian": "lvs_Latn",
-    "Magahi": "mag_Deva",
-    "Maithili": "mai_Deva",
-    "Malayalam": "mal_Mlym",
-    "Marathi": "mar_Deva",
-    "Minangkabau (Arabic script)": "min_Arab",
-    "Minangkabau (Latin script)": "min_Latn",
-    "Macedonian": "mkd_Cyrl",
-    "Plateau Malagasy": "plt_Latn",
-    "Maltese": "mlt_Latn",
-    "Meitei (Bengali script)": "mni_Beng",
-    "Halh Mongolian": "khk_Cyrl",
-    "Mossi": "mos_Latn",
-    "Maori": "mri_Latn",
-    "Burmese": "mya_Mymr",
-    "Dutch": "nld_Latn",
-    "Norwegian Nynorsk": "nno_Latn",
-    "Norwegian Bokmål": "nob_Latn",
-    "Nepali": "npi_Deva",
-    "Northern Sotho": "nso_Latn",
-    "Nuer": "nus_Latn",
-    "Nyanja": "nya_Latn",
-    "Occitan": "oci_Latn",
-    "West Central Oromo": "gaz_Latn",
-    "Odia": "ory_Orya",
-    "Pangasinan": "pag_Latn",
-    "Eastern Panjabi": "pan_Guru",
-    "Papiamento": "pap_Latn",
-    "Western Persian": "pes_Arab",
-    "Polish": "pol_Latn",
-    "Portuguese": "por_Latn",
-    "Dari": "prs_Arab",
-    "Southern Pashto": "pbt_Arab",
-    "Ayacucho Quechua": "quy_Latn",
-    "Romanian": "ron_Latn",
-    "Rundi": "run_Latn",
-    "Russian": "rus_Cyrl",
-    "Sango": "sag_Latn",
-    "Sanskrit": "san_Deva",
-    "Santali": "sat_Olck",
-    "Sicilian": "scn_Latn",
-    "Shan": "shn_Mymr",
-    "Sinhala": "sin_Sinh",
-    "Slovak": "slk_Latn",
-    "Slovenian": "slv_Latn",
-    "Samoan": "smo_Latn",
-    "Shona": "sna_Latn",
-    "Sindhi": "snd_Arab",
-    "Somali": "som_Latn",
-    "Southern Sotho": "sot_Latn",
-    "Spanish": "spa_Latn",
-    "Tosk Albanian": "als_Latn",
-    "Sardinian": "srd_Latn",
-    "Serbian": "srp_Cyrl",
-    "Swati": "ssw_Latn",
-    "Sundanese": "sun_Latn",
-    "Swedish": "swe_Latn",
-    "Swahili": "swh_Latn",
-    "Silesian": "szl_Latn",
-    "Tamil": "tam_Taml",
-    "Tatar": "tat_Cyrl",
-    "Telugu": "tel_Telu",
-    "Tajik": "tgk_Cyrl",
-    "Tagalog": "tgl_Latn",
-    "Thai": "tha_Thai",
-    "Tigrinya": "tir_Ethi",
-    "Tamasheq (Latin script)": "taq_Latn",
-    "Tamasheq (Tifinagh script)": "taq_Tfng",
-    "Tok Pisin": "tpi_Latn",
-    "Tswana": "tsn_Latn",
-    "Tsonga": "tso_Latn",
-    "Turkmen": "tuk_Latn",
-    "Tumbuka": "tum_Latn",
-    "Turkish": "tur_Latn",
-    "Twi": "twi_Latn",
-    "Central Atlas Tamazight": "tzm_Tfng",
-    "Uyghur": "uig_Arab",
-    "Ukrainian": "ukr_Cyrl",
-    "Umbundu": "umb_Latn",
-    "Urdu": "urd_Arab",
-    "Northern Uzbek": "uzn_Latn",
-    "Venetian": "vec_Latn",
-    "Vietnamese": "vie_Latn",
-    "Waray": "war_Latn",
-    "Wolof": "wol_Latn",
-    "Xhosa": "xho_Latn",
-    "Eastern Yiddish": "ydd_Hebr",
-    "Yoruba": "yor_Latn",
-    "Yue Chinese": "yue_Hant",
-    "Chinese (Simplified)": "zho_Hans",
-    "Chinese (Traditional)": "zho_Hant",
-    "Standard Malay": "zsm_Latn",
-    "Zulu": "zul_Latn",
-}

modules/translation/translation_base.py DELETED Viewed

@@ -1,151 +0,0 @@
-import os
-import torch
-import gradio as gr
-from abc import ABC, abstractmethod
-from typing import List
-from datetime import datetime
-from modules.whisper.whisper_parameter import *
-from modules.utils.subtitle_manager import *
class TranslationBase(ABC):
    """Abstract base class for subtitle translators.

    Subclasses implement ``translate`` and ``update_model``. This class
    provides the shared subtitle-file translation loop, device selection and
    clean-up helpers.
    """

    def __init__(self,
                 model_dir: str = os.path.join("models", "NLLB"),
                 output_dir: str = os.path.join("outputs", "translations")
                 ):
        super().__init__()
        self.model = None
        self.model_dir = model_dir
        self.output_dir = output_dir
        # Create both folders up front so later writes cannot fail on a missing directory.
        os.makedirs(self.model_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)
        self.current_model_size = None
        self.device = self.get_device()

    @abstractmethod
    def translate(self,
                  text: str,
                  max_length: int
                  ):
        """Translate ``text``; implemented by subclasses."""
        pass

    @abstractmethod
    def update_model(self,
                     model_size: str,
                     src_lang: str,
                     tgt_lang: str,
                     progress: gr.Progress
                     ):
        """Prepare the model for the given size and language pair; implemented by subclasses."""
        pass

    def translate_file(self,
                       fileobjs: list,
                       model_size: str,
                       src_lang: str,
                       tgt_lang: str,
                       max_length: int,
                       add_timestamp: bool,
                       progress=gr.Progress()) -> list:
        """
        Translate subtitle files from source language to target language

        Parameters
        ----------
        fileobjs: list
            List of subtitle files (.srt or .vtt) to translate from gr.Files()
        model_size: str
            Translation model size from gr.Dropdown()
        src_lang: str
            Source language of the file to translate from gr.Dropdown()
        tgt_lang: str
            Target language of the file to translate from gr.Dropdown()
        max_length: int
            Max length per line to translate
        add_timestamp: bool
            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        A List of
        String to return to gr.Textbox()
        Files to return to gr.Files()

        Any exception (including an unsupported file extension) is printed and
        the method then returns None.
        """
        try:
            # Extension -> (parser, serializer). Looked up per file so that an
            # unsupported extension fails loudly instead of reusing the
            # subtitle text of the previously processed file.
            subtitle_handlers = {
                ".srt": (parse_srt, get_serialized_srt),
                ".vtt": (parse_vtt, get_serialized_vtt),
            }

            self.update_model(model_size=model_size,
                              src_lang=src_lang,
                              tgt_lang=tgt_lang,
                              progress=progress)

            files_info = {}
            for fileobj in fileobjs:
                file_path = fileobj.name
                file_name, file_ext = os.path.splitext(os.path.basename(file_path))

                handlers = subtitle_handlers.get(file_ext.lower())
                if handlers is None:
                    raise ValueError(f"Unsupported subtitle format '{file_ext}': {file_path}")
                parse_subtitle, serialize_subtitle = handlers

                parsed_dicts = parse_subtitle(file_path=file_path)
                total_progress = len(parsed_dicts)
                for index, dic in enumerate(parsed_dicts):
                    progress(index / total_progress, desc="Translating..")
                    dic["sentence"] = self.translate(dic["sentence"], max_length=max_length)
                subtitle = serialize_subtitle(parsed_dicts)

                if add_timestamp:
                    timestamp = datetime.now().strftime("%m%d%H%M%S")
                    file_name += f"-{timestamp}"

                output_path = os.path.join(self.output_dir, f"{file_name}{file_ext}")
                write_file(subtitle, output_path)
                files_info[file_name] = {"subtitle": subtitle, "path": output_path}

            total_result = ''
            for file_name, info in files_info.items():
                total_result += '------------------------------------\n'
                total_result += f'{file_name}\n\n'
                total_result += f'{info["subtitle"]}'

            gr_str = f"Done! Subtitle is in the outputs/translation folder.\n\n{total_result}"
            output_file_paths = [info["path"] for info in files_info.values()]
            return [gr_str, output_file_paths]
        except Exception as e:
            print(f"Error: {str(e)}")
        finally:
            self.release_cuda_memory()

    @staticmethod
    def get_device():
        """Return "cuda", "mps" or "cpu", in that order of preference."""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    @staticmethod
    def release_cuda_memory():
        """Empty the CUDA cache and reset peak-memory statistics; no-op without CUDA."""
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            # reset_max_memory_allocated() is deprecated and simply forwards
            # to reset_peak_memory_stats(), so call the latter directly.
            torch.cuda.reset_peak_memory_stats()

    @staticmethod
    def remove_input_files(file_paths: List[str]):
        """Delete every existing file in ``file_paths``; falsy entries are ignored."""
        if not file_paths:
            return

        for file_path in file_paths:
            if file_path and os.path.exists(file_path):
                os.remove(file_path)

modules/utils/__init__.py DELETED Viewed

File without changes

modules/utils/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (173 Bytes)

modules/utils/__pycache__/files_manager.cpython-310.pyc DELETED Viewed

Binary file (1.43 kB)

modules/utils/__pycache__/subtitle_manager.cpython-310.pyc DELETED Viewed

Binary file (3.38 kB)

modules/utils/__pycache__/youtube_manager.cpython-310.pyc DELETED Viewed

Binary file (748 Bytes)

modules/utils/files_manager.py DELETED Viewed

@@ -1,39 +0,0 @@
-import os
-import fnmatch
-from gradio.utils import NamedString
def get_media_files(folder_path, include_sub_directory=False):
    """Return paths of video/audio files found in ``folder_path``.

    Parameters
    ----------
    folder_path:
        Directory to search.
    include_sub_directory: bool
        When True the whole tree below ``folder_path`` is walked, otherwise
        only the directory itself is listed.

    Returns
    ----------
    List of paths, grouped by extension pattern in the order the patterns
    are declared below.
    """
    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
    audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
    media_extensions = video_extensions + audio_extensions

    media_files = []

    if include_sub_directory:
        for root, _, files in os.walk(folder_path):
            for extension in media_extensions:
                media_files.extend(
                    os.path.join(root, file) for file in fnmatch.filter(files, extension)
                    if os.path.exists(os.path.join(root, file))
                )
    else:
        # List the directory once instead of once per extension pattern.
        entries = os.listdir(folder_path)
        for extension in media_extensions:
            media_files.extend(
                os.path.join(folder_path, file) for file in fnmatch.filter(entries, extension)
                # isfile() already implies existence, and rejects directories
                # that happen to carry a media extension.
                if os.path.isfile(os.path.join(folder_path, file))
            )

    return media_files
def format_gradio_files(files: list):
    """Wrap each entry of ``files`` in a gradio ``NamedString``.

    A falsy ``files`` value (None or empty) is returned unchanged.
    """
    if not files:
        return files
    return [NamedString(path) for path in files]

modules/utils/subtitle_manager.py DELETED Viewed

@@ -1,135 +0,0 @@
-import re
def timeformat_srt(time):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so float noise (e.g. 1.001 s being stored as 1.000999...) cannot
    drop a millisecond and a rounded-up value carries into the seconds field.
    """
    total_milliseconds = int(round(time * 1000))
    hours, remainder = divmod(total_milliseconds, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def timeformat_vtt(time):
    """Format a duration in seconds as a WebVTT timestamp ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split into
    fields, so float noise (e.g. 1.001 s being stored as 1.000999...) cannot
    drop a millisecond and a rounded-up value carries into the seconds field.
    """
    total_milliseconds = int(round(time * 1000))
    hours, remainder = divmod(total_milliseconds, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def write_file(subtitle, output_file):
    """Write the ``subtitle`` text to ``output_file`` as UTF-8, replacing any existing file."""
    with open(output_file, mode='w', encoding='utf-8') as handle:
        handle.write(subtitle)
def get_srt(segments):
    """Serialize ``segments`` into SRT text.

    Each segment is a dict with 'start', 'end' and 'text' keys. Cues are
    numbered from 1. A single leading space in 'text' is stripped, and the
    stripped value is written back into the segment dict.
    """
    cues = []
    for number, segment in enumerate(segments, start=1):
        time_range = f"{timeformat_srt(segment['start'])} --> {timeformat_srt(segment['end'])}"
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        cues.append(f"{number}\n{time_range}\n{text}\n\n")
    return "".join(cues)
def get_vtt(segments):
    """Serialize ``segments`` into WebVTT text, starting with a header line.

    Each segment is a dict with 'start', 'end' and 'text' keys. Cues are
    numbered from 1. A single leading space in 'text' is stripped, and the
    stripped value is written back into the segment dict.
    """
    pieces = ["WebVTT\n\n"]
    for number, segment in enumerate(segments, start=1):
        time_range = f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}"
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        pieces.append(f"{number}\n{time_range}\n{text}\n\n")
    return "".join(pieces)
def get_txt(segments):
    """Return the text of every segment, one per line.

    A single leading space in 'text' is stripped, and the stripped value is
    written back into the segment dict.
    """
    lines = []
    for segment in segments:
        text = segment['text']
        if text.startswith(' '):
            text = text[1:]
            segment['text'] = text
        lines.append(f"{text}\n")
    return "".join(lines)
def parse_srt(file_path):
    """Read an SRT file and return its cues as a list of dicts.

    Each dict has the keys "index", "timestamp" and "sentence"; the text
    lines of a cue are joined with single spaces.

    The file is decoded as "utf-8-sig" so a leading byte-order mark does not
    end up inside the first index. Blocks are separated by blank lines, which
    may contain whitespace. A block with fewer than two lines cannot hold an
    index and a timestamp and is skipped.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        srt_data = file.read()

    data = []
    for block in re.split(r'\n\s*\n', srt_data):
        lines = block.strip().split('\n')
        if len(lines) < 2:
            continue
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data
def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Each dict has the keys "index", "timestamp" and "sentence"; the text
    lines of a cue are joined with single spaces.

    The header block is recognised case-insensitively, so both the standard
    "WEBVTT" signature and the "WebVTT" spelling are skipped. The file is
    decoded as "utf-8-sig" so a leading byte-order mark is dropped. Blocks are
    separated by blank lines, which may contain whitespace. A block with fewer
    than two lines cannot hold an index and a timestamp and is skipped.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        webvtt_data = file.read()

    data = []
    for block in re.split(r'\n\s*\n', webvtt_data):
        block = block.strip()
        if block.upper().startswith("WEBVTT"):
            continue
        lines = block.split('\n')
        if len(lines) < 2:
            continue
        data.append({
            "index": lines[0],
            "timestamp": lines[1],
            "sentence": ' '.join(lines[2:])
        })
    return data
def get_serialized_srt(dicts):
    """Turn parsed cue dicts ("index", "timestamp", "sentence") back into SRT text."""
    return "".join(
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    )
def get_serialized_vtt(dicts):
    """Turn parsed cue dicts ("index", "timestamp", "sentence") back into WebVTT text with a header."""
    cue_blocks = [
        f'{cue["index"]}\n{cue["timestamp"]}\n{cue["sentence"]}\n\n'
        for cue in dicts
    ]
    return "WebVTT\n\n" + "".join(cue_blocks)
def safe_filename(name):
    """Replace characters matched by the invalid-filename pattern with '_'.

    When the application was started with the ``colab`` flag, the result is
    additionally truncated to at most 20 characters. The text after the last
    '.' is kept as the extension when it fits within that limit.
    """
    from app import _args
    invalid_chars_pattern = r'[<>:"/\\|?*\x00-\x1f]'
    max_length = 20

    sanitized = re.sub(invalid_chars_pattern, '_', name)
    if not _args.colab:
        return sanitized
    if len(sanitized) <= max_length:
        return sanitized

    extension = sanitized.split('.')[-1]
    if len(extension) + 1 >= max_length:
        # The extension alone would use up the budget: plain cut.
        return sanitized[:max_length]
    stem_budget = max_length - len(extension) - 1
    return sanitized[:stem_budget] + '.' + extension

modules/utils/youtube_manager.py DELETED Viewed

@@ -1,15 +0,0 @@
-from pytubefix import YouTube
-import os
def get_ytdata(link):
    """Create a pytubefix ``YouTube`` object for ``link``."""
    video = YouTube(link)
    return video
def get_ytmetas(link):
    """Return ``(thumbnail_url, title, description)`` read from the ``YouTube`` object for ``link``."""
    video = YouTube(link)
    thumbnail_url = video.thumbnail_url
    title = video.title
    description = video.description
    return thumbnail_url, title, description
def get_ytaudio(ytdata: YouTube):
    """Download the audio-only stream of ``ytdata`` to ``modules/yt_tmp.wav``.

    Returns whatever the stream's ``download`` call returns.
    """
    target_path = os.path.join("modules", "yt_tmp.wav")
    audio_stream = ytdata.streams.get_audio_only()
    return audio_stream.download(filename=target_path)

modules/vad/__init__.py DELETED Viewed

File without changes

modules/vad/silero_vad.py DELETED Viewed

@@ -1,264 +0,0 @@
-# Adapted from https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py
-from faster_whisper.vad import VadOptions, get_vad_model
-import numpy as np
-from typing import BinaryIO, Union, List, Optional, Tuple
-import warnings
-import faster_whisper
-from faster_whisper.transcribe import SpeechTimestampsMap, Segment
-import gradio as gr
class SileroVAD:
    """Voice activity detection using the VAD model shipped with faster-whisper."""

    def __init__(self):
        self.sampling_rate = 16000
        self.window_size_samples = 512
        # Loaded lazily by update_model() on first use.
        self.model = None

    def run(self,
            audio: Union[str, BinaryIO, np.ndarray],
            vad_parameters: VadOptions,
            progress: gr.Progress = gr.Progress()
            ) -> Tuple[np.ndarray, List[dict]]:
        """
        Run VAD

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        vad_parameters:
            Options for VAD processing. None selects the default VadOptions;
            a dict is expanded into VadOptions keyword arguments.
        progress: gr.Progress
            Indicator to show progress directly in gradio.

        Returns
        ----------
        np.ndarray
            Pre-processed audio with VAD
        List[dict]
            Chunks of speeches to be used to restore the timestamps later
        """
        # Anything that is not already an array is decoded at the class sampling rate.
        if not isinstance(audio, np.ndarray):
            audio = faster_whisper.decode_audio(audio, sampling_rate=self.sampling_rate)

        if vad_parameters is None:
            vad_parameters = VadOptions()
        elif isinstance(vad_parameters, dict):
            vad_parameters = VadOptions(**vad_parameters)

        speech_chunks = self.get_speech_timestamps(
            audio=audio,
            vad_options=vad_parameters,
            progress=progress
        )
        audio = self.collect_chunks(audio, speech_chunks)

        return audio, speech_chunks

    def get_speech_timestamps(
        self,
        audio: np.ndarray,
        vad_options: Optional[VadOptions] = None,
        progress: gr.Progress = gr.Progress(),
        **kwargs,
    ) -> List[dict]:
        """This method is used for splitting long audios into speech chunks using silero VAD.

        Args:
          audio: One dimensional float array.
          vad_options: Options for VAD processing.
          kwargs: VAD options passed as keyword arguments for backward compatibility.
          progress: Gradio progress to indicate progress.

        Returns:
          List of dicts containing begin and end samples of each speech chunk.
        """
        if self.model is None:
            self.update_model()

        if vad_options is None:
            vad_options = VadOptions(**kwargs)

        threshold = vad_options.threshold
        min_speech_duration_ms = vad_options.min_speech_duration_ms
        max_speech_duration_s = vad_options.max_speech_duration_s
        min_silence_duration_ms = vad_options.min_silence_duration_ms
        window_size_samples = self.window_size_samples
        speech_pad_ms = vad_options.speech_pad_ms
        # Use the same rate run() decodes audio with instead of a second hard-coded copy.
        sampling_rate = self.sampling_rate
        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        max_speech_samples = (
                sampling_rate * max_speech_duration_s
                - window_size_samples
                - 2 * speech_pad_samples
        )
        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000

        audio_length_samples = len(audio)

        state, context = self.model.get_initial_states(batch_size=1)

        # First pass: one speech probability per fixed-size window.
        speech_probs = []
        for current_start_sample in range(0, audio_length_samples, window_size_samples):
            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")

            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
            # The last window may be short; zero-pad it to the full window size.
            if len(chunk) < window_size_samples:
                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
            speech_probs.append(speech_prob)

        # Second pass: turn the per-window probabilities into speech segments.
        triggered = False
        speeches = []
        current_speech = {}
        neg_threshold = threshold - 0.15

        # to save potential segment end (and tolerate some silence)
        temp_end = 0
        # to save potential segment limits in case of maximum segment size reached
        prev_end = next_start = 0

        for i, speech_prob in enumerate(speech_probs):
            if (speech_prob >= threshold) and temp_end:
                temp_end = 0
                if next_start < prev_end:
                    next_start = window_size_samples * i

            if (speech_prob >= threshold) and not triggered:
                triggered = True
                current_speech["start"] = window_size_samples * i
                continue

            if (
                    triggered
                    and (window_size_samples * i) - current_speech["start"] > max_speech_samples
            ):
                if prev_end:
                    current_speech["end"] = prev_end
                    speeches.append(current_speech)
                    current_speech = {}
                    # previously reached silence (< neg_thres) and is still not speech (< thres)
                    if next_start < prev_end:
                        triggered = False
                    else:
                        current_speech["start"] = next_start
                    prev_end = next_start = temp_end = 0
                else:
                    current_speech["end"] = window_size_samples * i
                    speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

            if (speech_prob < neg_threshold) and triggered:
                if not temp_end:
                    temp_end = window_size_samples * i
                # condition to avoid cutting in very short silence
                if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
                    prev_end = temp_end
                if (window_size_samples * i) - temp_end < min_silence_samples:
                    continue
                else:
                    current_speech["end"] = temp_end
                    if (
                            current_speech["end"] - current_speech["start"]
                    ) > min_speech_samples:
                        speeches.append(current_speech)
                    current_speech = {}
                    prev_end = next_start = temp_end = 0
                    triggered = False
                    continue

        # Close a segment that is still open when the audio ends.
        if (
                current_speech
                and (audio_length_samples - current_speech["start"]) > min_speech_samples
        ):
            current_speech["end"] = audio_length_samples
            speeches.append(current_speech)

        # Pad every segment; neighbours closer than twice the padding split the gap evenly.
        for i, speech in enumerate(speeches):
            if i == 0:
                speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
            if i != len(speeches) - 1:
                silence_duration = speeches[i + 1]["start"] - speech["end"]
                if silence_duration < 2 * speech_pad_samples:
                    speech["end"] += int(silence_duration // 2)
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - silence_duration // 2)
                    )
                else:
                    speech["end"] = int(
                        min(audio_length_samples, speech["end"] + speech_pad_samples)
                    )
                    speeches[i + 1]["start"] = int(
                        max(0, speeches[i + 1]["start"] - speech_pad_samples)
                    )
            else:
                speech["end"] = int(
                    min(audio_length_samples, speech["end"] + speech_pad_samples)
                )

        return speeches

    def update_model(self):
        """Load the VAD model via faster-whisper's ``get_vad_model``."""
        self.model = get_vad_model()

    @staticmethod
    def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
        """Collects and concatenates audio chunks."""
        if not chunks:
            return np.array([], dtype=np.float32)

        return np.concatenate([audio[chunk["start"]: chunk["end"]] for chunk in chunks])

    @staticmethod
    def format_timestamp(
        seconds: float,
        always_include_hours: bool = False,
        decimal_marker: str = ".",
    ) -> str:
        """Format ``seconds`` as ``[HH:]MM:SS<decimal_marker>mmm``.

        The hours field is emitted only when it is non-zero or
        ``always_include_hours`` is set.
        """
        assert seconds >= 0, "non-negative timestamp expected"
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return (
            f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
        )

    def restore_speech_timestamps(
        self,
        segments: List[dict],
        speech_chunks: List[dict],
        sampling_rate: Optional[int] = None,
    ) -> List[dict]:
        """Map segment 'start'/'end' times back through ``SpeechTimestampsMap``.

        ``segments`` is modified in place and also returned. ``sampling_rate``
        defaults to the instance's sampling rate.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

        for segment in segments:
            segment["start"] = ts_map.get_original_time(segment["start"])
            segment["end"] = ts_map.get_original_time(segment["end"])

        return segments

modules/whisper/__init__.py DELETED Viewed

File without changes

modules/whisper/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (175 Bytes)

modules/whisper/__pycache__/faster_whisper_inference.cpython-310.pyc DELETED Viewed

Binary file (6.51 kB)

modules/whisper/__pycache__/whisper_base.cpython-310.pyc DELETED Viewed

Binary file (12.9 kB)

modules/whisper/__pycache__/whisper_factory.cpython-310.pyc DELETED Viewed

Binary file (2.87 kB)

modules/whisper/__pycache__/whisper_parameter.cpython-310.pyc DELETED Viewed

Binary file (3.68 kB)

modules/whisper/faster_whisper_inference.py DELETED Viewed

@@ -1,191 +0,0 @@
-import os
-import time
-import numpy as np
-import torch
-from typing import BinaryIO, Union, Tuple, List
-import faster_whisper
-from faster_whisper.vad import VadOptions
-import ast
-import ctranslate2
-import whisper
-import gradio as gr
-from argparse import Namespace
-from modules.whisper.whisper_parameter import *
-from modules.whisper.whisper_base import WhisperBase
class FasterWhisperInference(WhisperBase):
    """Whisper transcription backend built on faster-whisper (CTranslate2)."""

    def __init__(self,
                 model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
                 diarization_model_dir: str = os.path.join("models", "Diarization"),
                 output_dir: str = os.path.join("outputs"),
                 ):
        super().__init__(
            model_dir=model_dir,
            diarization_model_dir=diarization_model_dir,
            output_dir=output_dir
        )
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)

        self.model_paths = self.get_model_paths()
        self.device = self.get_device()
        self.available_models = self.model_paths.keys()
        self.available_compute_types = ctranslate2.get_supported_compute_types(
            "cuda") if self.device == "cuda" else ctranslate2.get_supported_compute_types("cpu")

    def transcribe(self,
                   audio: Union[str, BinaryIO, np.ndarray],
                   progress: gr.Progress,
                   *whisper_params,
                   ) -> Tuple[List[dict], float]:
        """
        transcribe method for faster-whisper.

        Parameters
        ----------
        audio: Union[str, BinaryIO, np.ndarray]
            Audio path or file binary or Audio numpy array
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        *whisper_params: tuple
            Parameters related with whisper. This will be dealt with "WhisperParameters" data class

        Returns
        ----------
        segments_result: List[dict]
            list of dicts that includes start, end timestamps and transcribed text
        elapsed_time: float
            elapsed time for transcription
        """
        start_time = time.time()

        params = WhisperParameters.as_value(*whisper_params)

        # update_model() stores the resolved entry of model_paths in
        # current_model_size, so compare against that entry. Comparing the
        # bare name would reload locally stored models (whose path differs
        # from their name) on every call.
        requested_model = self.model_paths.get(params.model_size, params.model_size)
        if requested_model != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
            self.update_model(params.model_size, params.compute_type, progress)

        # None parameters with Textboxes: https://github.com/gradio-app/gradio/issues/8723
        if not params.initial_prompt:
            params.initial_prompt = None
        if not params.prefix:
            params.prefix = None
        if not params.hotwords:
            params.hotwords = None

        params.suppress_tokens = self.format_suppress_tokens_str(params.suppress_tokens)

        segments, info = self.model.transcribe(
            audio=audio,
            language=params.lang,
            task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
            beam_size=params.beam_size,
            log_prob_threshold=params.log_prob_threshold,
            no_speech_threshold=params.no_speech_threshold,
            best_of=params.best_of,
            patience=params.patience,
            temperature=params.temperature,
            initial_prompt=params.initial_prompt,
            compression_ratio_threshold=params.compression_ratio_threshold,
            length_penalty=params.length_penalty,
            repetition_penalty=params.repetition_penalty,
            no_repeat_ngram_size=params.no_repeat_ngram_size,
            prefix=params.prefix,
            suppress_blank=params.suppress_blank,
            suppress_tokens=params.suppress_tokens,
            max_initial_timestamp=params.max_initial_timestamp,
            word_timestamps=params.word_timestamps,
            prepend_punctuations=params.prepend_punctuations,
            append_punctuations=params.append_punctuations,
            max_new_tokens=params.max_new_tokens,
            chunk_length=params.chunk_length,
            hallucination_silence_threshold=params.hallucination_silence_threshold,
            hotwords=params.hotwords,
            language_detection_threshold=params.language_detection_threshold,
            language_detection_segments=params.language_detection_segments,
            prompt_reset_on_temperature=params.prompt_reset_on_temperature,
        )
        progress(0, desc="Loading audio..")

        segments_result = []
        for segment in segments:
            progress(segment.start / info.duration, desc="Transcribing..")
            segments_result.append({
                "start": segment.start,
                "end": segment.end,
                "text": segment.text
            })

        elapsed_time = time.time() - start_time
        return segments_result, elapsed_time

    def update_model(self,
                     model_size: str,
                     compute_type: str,
                     progress: gr.Progress
                     ):
        """
        Update current model setting

        Parameters
        ----------
        model_size: str
            Size of whisper model
        compute_type: str
            Compute type for transcription.
            see more info : https://opennmt.net/CTranslate2/quantization.html
        progress: gr.Progress
            Indicator to show progress directly in gradio.
        """
        progress(0, desc="Initializing Model..")
        # NOTE: this stores the model_paths entry (a name or a local path), not the raw key.
        self.current_model_size = self.model_paths[model_size]
        self.current_compute_type = compute_type
        self.model = faster_whisper.WhisperModel(
            device=self.device,
            model_size_or_path=self.current_model_size,
            download_root=self.model_dir,
            compute_type=self.current_compute_type
        )

    def get_model_paths(self):
        """
        Get available models from models path including fine-tuned model.

        Returns
        ----------
        Dict mapping each model name to the value handed to faster-whisper:
        the name itself for the names reported by ``whisper.available_models()``,
        or an absolute path inside ``self.model_dir`` for other entries found there.
        """
        openai_models = whisper.available_models()
        model_paths = {model: model for model in openai_models}

        faster_whisper_prefix = "models--Systran--faster-whisper-"
        wrong_dirs = {".locks"}
        existing_models = set(os.listdir(self.model_dir)) - wrong_dirs

        webui_dir = os.getcwd()

        for model_name in existing_models:
            # Strip the prefix only when the name really starts with it; a
            # mere substring match would cut off unrelated characters.
            if model_name.startswith(faster_whisper_prefix):
                model_name = model_name[len(faster_whisper_prefix):]

            if model_name not in openai_models:
                model_paths[model_name] = os.path.join(webui_dir, self.model_dir, model_name)
        return model_paths

    @staticmethod
    def get_device():
        """Return "cuda" when CUDA is available, otherwise "auto"."""
        if torch.cuda.is_available():
            return "cuda"
        else:
            return "auto"

    @staticmethod
    def format_suppress_tokens_str(suppress_tokens_str: str) -> List[int]:
        """Parse a textual list of ints such as "[-1]" into ``List[int]``.

        Raises
        ----------
        ValueError
            If the text cannot be evaluated as a Python literal or the result
            is not a list of ints.
        """
        error_message = "Invalid Suppress Tokens. The value must be type of List[int]"
        try:
            suppress_tokens = ast.literal_eval(suppress_tokens_str)
        except Exception as e:
            # Keep the original failure attached as the cause for debugging.
            raise ValueError(error_message) from e

        if not isinstance(suppress_tokens, list) or not all(isinstance(item, int) for item in suppress_tokens):
            raise ValueError(error_message)
        return suppress_tokens

modules/whisper/insanely_fast_whisper_inference.py DELETED Viewed

@@ -1,185 +0,0 @@
-import os
-import time
-import numpy as np
-from typing import BinaryIO, Union, Tuple, List
-import torch
-from transformers import pipeline
-from transformers.utils import is_flash_attn_2_available
-import gradio as gr
-from huggingface_hub import hf_hub_download
-import whisper
-from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
-from argparse import Namespace
-from modules.whisper.whisper_parameter import *
-from modules.whisper.whisper_base import WhisperBase
-class InsanelyFastWhisperInference(WhisperBase):
-    def __init__(self,
-                 model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
-                 diarization_model_dir: str = os.path.join("models", "Diarization"),
-                 output_dir: str = os.path.join("outputs"),
-                 ):
-        super().__init__(
-            model_dir=model_dir,
-            output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
-        )
-        self.model_dir = model_dir
-        os.makedirs(self.model_dir, exist_ok=True)
-        openai_models = whisper.available_models()
-        distil_models = ["distil-large-v2", "distil-large-v3", "distil-medium.en", "distil-small.en"]
-        self.available_models = openai_models + distil_models
-        self.available_compute_types = ["float16"]
-    def transcribe(self,
-                   audio: Union[str, np.ndarray, torch.Tensor],
-                   progress: gr.Progress,
-                   *whisper_params,
-                   ) -> Tuple[List[dict], float]:
-        """
-        transcribe method for faster-whisper.
-        Parameters
-        ----------
-        audio: Union[str, BinaryIO, np.ndarray]
-            Audio path or file binary or Audio numpy array
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
-        elapsed_time: float
-            elapsed time for transcription
-        """
-        start_time = time.time()
-        params = WhisperParameters.as_value(*whisper_params)
-        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
-            self.update_model(params.model_size, params.compute_type, progress)
-        progress(0, desc="Transcribing...Progress is not shown in insanely-fast-whisper.")
-        with Progress(
-                TextColumn("[progress.description]{task.description}"),
-                BarColumn(style="yellow1", pulse_style="white"),
-                TimeElapsedColumn(),
-        ) as progress:
-            progress.add_task("[yellow]Transcribing...", total=None)
-            segments = self.model(
-                inputs=audio,
-                return_timestamps=True,
-                chunk_length_s=params.chunk_length_s,
-                batch_size=params.batch_size,
-                generate_kwargs={
-                    "language": params.lang,
-                    "task": "translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
-                    "no_speech_threshold": params.no_speech_threshold,
-                    "temperature": params.temperature,
-                    "compression_ratio_threshold": params.compression_ratio_threshold
-                }
-            )
-        segments_result = self.format_result(
-            transcribed_result=segments,
-        )
-        elapsed_time = time.time() - start_time
-        return segments_result, elapsed_time
-    def update_model(self,
-                     model_size: str,
-                     compute_type: str,
-                     progress: gr.Progress,
-                     ):
-        """
-        Update current model setting
-        Parameters
-        ----------
-        model_size: str
-            Size of whisper model
-        compute_type: str
-            Compute type for transcription.
-            see more info : https://opennmt.net/CTranslate2/quantization.html
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        """
-        progress(0, desc="Initializing Model..")
-        model_path = os.path.join(self.model_dir, model_size)
-        if not os.path.isdir(model_path) or not os.listdir(model_path):
-            self.download_model(
-                model_size=model_size,
-                download_root=model_path,
-                progress=progress
-            )
-        self.current_compute_type = compute_type
-        self.current_model_size = model_size
-        self.model = pipeline(
-            "automatic-speech-recognition",
-            model=os.path.join(self.model_dir, model_size),
-            torch_dtype=self.current_compute_type,
-            device=self.device,
-            model_kwargs={"attn_implementation": "flash_attention_2"} if is_flash_attn_2_available() else {"attn_implementation": "sdpa"},
-        )
-    @staticmethod
-    def format_result(
-        transcribed_result: dict
-    ) -> List[dict]:
-        """
-        Format the transcription result of insanely_fast_whisper as the same with other implementation.
-        Parameters
-        ----------
-        transcribed_result: dict
-            Transcription result of the insanely_fast_whisper
-        Returns
-        ----------
-        result: List[dict]
-            Formatted result as the same with other implementation
-        """
-        result = transcribed_result["chunks"]
-        for item in result:
-            start, end = item["timestamp"][0], item["timestamp"][1]
-            if end is None:
-                end = start
-            item["start"] = start
-            item["end"] = end
-        return result
-    @staticmethod
-    def download_model(
-        model_size: str,
-        download_root: str,
-        progress: gr.Progress
-    ):
-        progress(0, 'Initializing model..')
-        print(f'Downloading {model_size} to "{download_root}"....')
-        os.makedirs(download_root, exist_ok=True)
-        download_list = [
-            "model.safetensors",
-            "config.json",
-            "generation_config.json",
-            "preprocessor_config.json",
-            "tokenizer.json",
-            "tokenizer_config.json",
-            "added_tokens.json",
-            "special_tokens_map.json",
-            "vocab.json",
-        ]
-        if model_size.startswith("distil"):
-            repo_id = f"distil-whisper/{model_size}"
-        else:
-            repo_id = f"openai/whisper-{model_size}"
-        for item in download_list:
-            hf_hub_download(repo_id=repo_id, filename=item, local_dir=download_root)

modules/whisper/whisper_Inference.py DELETED Viewed

@@ -1,101 +0,0 @@
-import whisper
-import gradio as gr
-import time
-from typing import BinaryIO, Union, Tuple, List
-import numpy as np
-import torch
-import os
-from argparse import Namespace
-from modules.whisper.whisper_base import WhisperBase
-from modules.whisper.whisper_parameter import *
-class WhisperInference(WhisperBase):
-    def __init__(self,
-                 model_dir: str = os.path.join("models", "Whisper"),
-                 diarization_model_dir: str = os.path.join("models", "Diarization"),
-                 output_dir: str = os.path.join("outputs"),
-                 ):
-        super().__init__(
-            model_dir=model_dir,
-            output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
-        )
-    def transcribe(self,
-                   audio: Union[str, np.ndarray, torch.Tensor],
-                   progress: gr.Progress,
-                   *whisper_params,
-                   ) -> Tuple[List[dict], float]:
-        """
-        transcribe method for faster-whisper.
-        Parameters
-        ----------
-        audio: Union[str, BinaryIO, np.ndarray]
-            Audio path or file binary or Audio numpy array
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
-        elapsed_time: float
-            elapsed time for transcription
-        """
-        start_time = time.time()
-        params = WhisperParameters.as_value(*whisper_params)
-        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
-            self.update_model(params.model_size, params.compute_type, progress)
-        def progress_callback(progress_value):
-            progress(progress_value, desc="Transcribing..")
-        segments_result = self.model.transcribe(audio=audio,
-                                                language=params.lang,
-                                                verbose=False,
-                                                beam_size=params.beam_size,
-                                                logprob_threshold=params.log_prob_threshold,
-                                                no_speech_threshold=params.no_speech_threshold,
-                                                task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
-                                                fp16=True if params.compute_type == "float16" else False,
-                                                best_of=params.best_of,
-                                                patience=params.patience,
-                                                temperature=params.temperature,
-                                                compression_ratio_threshold=params.compression_ratio_threshold,
-                                                progress_callback=progress_callback,)["segments"]
-        elapsed_time = time.time() - start_time
-        return segments_result, elapsed_time
-    def update_model(self,
-                     model_size: str,
-                     compute_type: str,
-                     progress: gr.Progress,
-                     ):
-        """
-        Update current model setting
-        Parameters
-        ----------
-        model_size: str
-            Size of whisper model
-        compute_type: str
-            Compute type for transcription.
-            see more info : https://opennmt.net/CTranslate2/quantization.html
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        """
-        progress(0, desc="Initializing Model..")
-        self.current_compute_type = compute_type
-        self.current_model_size = model_size
-        self.model = whisper.load_model(
-            name=model_size,
-            device=self.device,
-            download_root=self.model_dir
-        )

modules/whisper/whisper_base.py DELETED Viewed

@@ -1,436 +0,0 @@
-import os
-import torch
-import whisper
-import gradio as gr
-from abc import ABC, abstractmethod
-from typing import BinaryIO, Union, Tuple, List
-import numpy as np
-from datetime import datetime
-from faster_whisper.vad import VadOptions
-from dataclasses import astuple
-from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
-from modules.utils.youtube_manager import get_ytdata, get_ytaudio
-from modules.utils.files_manager import get_media_files, format_gradio_files
-from modules.whisper.whisper_parameter import *
-from modules.diarize.diarizer import Diarizer
-from modules.vad.silero_vad import SileroVAD
-class WhisperBase(ABC):
-    def __init__(self,
-                 model_dir: str = os.path.join("models", "Whisper"),
-                 diarization_model_dir: str = os.path.join("models", "Diarization"),
-                 output_dir: str = os.path.join("outputs"),
-                 ):
-        self.model_dir = model_dir
-        self.output_dir = output_dir
-        os.makedirs(self.output_dir, exist_ok=True)
-        os.makedirs(self.model_dir, exist_ok=True)
-        self.diarizer = Diarizer(
-            model_dir=diarization_model_dir
-        )
-        self.vad = SileroVAD()
-        self.model = None
-        self.current_model_size = None
-        self.available_models = whisper.available_models()
-        self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
-        self.translatable_models = ["large", "large-v1", "large-v2", "large-v3"]
-        self.device = self.get_device()
-        self.available_compute_types = ["float16", "float32"]
-        self.current_compute_type = "float16" if self.device == "cuda" else "float32"
-    @abstractmethod
-    def transcribe(self,
-                   audio: Union[str, BinaryIO, np.ndarray],
-                   progress: gr.Progress,
-                   *whisper_params,
-                   ):
-        """Inference whisper model to transcribe"""
-        pass
-    @abstractmethod
-    def update_model(self,
-                     model_size: str,
-                     compute_type: str,
-                     progress: gr.Progress
-                     ):
-        """Initialize whisper model"""
-        pass
-    def run(self,
-            audio: Union[str, BinaryIO, np.ndarray],
-            progress: gr.Progress,
-            *whisper_params,
-            ) -> Tuple[List[dict], float]:
-        """
-        Run transcription with conditional pre-processing and post-processing.
-        The VAD will be performed to remove noise from the audio input in pre-processing, if enabled.
-        The diarization will be performed in post-processing, if enabled.
-        Parameters
-        ----------
-        audio: Union[str, BinaryIO, np.ndarray]
-            Audio input. This can be file path or binary type.
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
-        elapsed_time: float
-            elapsed time for running
-        """
-        params = WhisperParameters.as_value(*whisper_params)
-        if params.lang == "Automatic Detection":
-            params.lang = None
-        else:
-            language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
-            params.lang = language_code_dict[params.lang]
-        speech_chunks = None
-        if params.vad_filter:
-            # Explicit value set for float('inf') from gr.Number()
-            if params.max_speech_duration_s >= 9999:
-                params.max_speech_duration_s = float('inf')
-            vad_options = VadOptions(
-                threshold=params.threshold,
-                min_speech_duration_ms=params.min_speech_duration_ms,
-                max_speech_duration_s=params.max_speech_duration_s,
-                min_silence_duration_ms=params.min_silence_duration_ms,
-                speech_pad_ms=params.speech_pad_ms
-            )
-            audio, speech_chunks = self.vad.run(
-                audio=audio,
-                vad_parameters=vad_options,
-                progress=progress
-            )
-        result, elapsed_time = self.transcribe(
-            audio,
-            progress,
-            *astuple(params)
-        )
-        if params.vad_filter:
-            result = self.vad.restore_speech_timestamps(
-                segments=result,
-                speech_chunks=speech_chunks,
-            )
-        if params.is_diarize:
-            result, elapsed_time_diarization = self.diarizer.run(
-                audio=audio,
-                use_auth_token=params.hf_token,
-                transcribed_result=result,
-            )
-            elapsed_time += elapsed_time_diarization
-        return result, elapsed_time
-    def transcribe_file(self,
-                        files: list,
-                        input_folder_path: str,
-                        file_format: str,
-                        add_timestamp: bool,
-                        progress=gr.Progress(),
-                        *whisper_params,
-                        ) -> list:
-        """
-        Write subtitle file from Files
-        Parameters
-        ----------
-        files: list
-            List of files to transcribe from gr.Files()
-        input_folder_path: str
-            Input folder path to transcribe from gr.Textbox(). If this is provided, `files` will be ignored and
-            this will be used instead.
-        file_format: str
-            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        add_timestamp: bool
-            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        result_str:
-            Result of transcription to return to gr.Textbox()
-        result_file_path:
-            Output file path to return to gr.Files()
-        """
-        try:
-            if input_folder_path:
-                files = get_media_files(input_folder_path)
-                files = format_gradio_files(files)
-            files_info = {}
-            for file in files:
-                transcribed_segments, time_for_task = self.run(
-                    file.name,
-                    progress,
-                    *whisper_params,
-                )
-                file_name, file_ext = os.path.splitext(os.path.basename(file.name))
-                subtitle, file_path = self.generate_and_write_file(
-                    file_name=file_name,
-                    transcribed_segments=transcribed_segments,
-                    add_timestamp=add_timestamp,
-                    file_format=file_format,
-                    output_dir=self.output_dir
-                )
-                files_info[file_name] = {"subtitle": subtitle, "time_for_task": time_for_task, "path": file_path}
-            total_result = ''
-            total_time = 0
-            for file_name, info in files_info.items():
-                total_result += '------------------------------------\n'
-                total_result += f'{file_name}\n\n'
-                total_result += f'{info["subtitle"]}'
-                total_time += info["time_for_task"]
-            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
-            result_file_path = [info['path'] for info in files_info.values()]
-            return [result_str, result_file_path]
-        except Exception as e:
-            print(f"Error transcribing file: {e}")
-        finally:
-            self.release_cuda_memory()
-            if not files:
-                self.remove_input_files([file.name for file in files])
-    def transcribe_mic(self,
-                       mic_audio: str,
-                       file_format: str,
-                       progress=gr.Progress(),
-                       *whisper_params,
-                       ) -> list:
-        """
-        Write subtitle file from microphone
-        Parameters
-        ----------
-        mic_audio: str
-            Audio file path from gr.Microphone()
-        file_format: str
-            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        result_str:
-            Result of transcription to return to gr.Textbox()
-        result_file_path:
-            Output file path to return to gr.Files()
-        """
-        try:
-            progress(0, desc="Loading Audio..")
-            transcribed_segments, time_for_task = self.run(
-                mic_audio,
-                progress,
-                *whisper_params,
-            )
-            progress(1, desc="Completed!")
-            subtitle, result_file_path = self.generate_and_write_file(
-                file_name="Mic",
-                transcribed_segments=transcribed_segments,
-                add_timestamp=True,
-                file_format=file_format,
-                output_dir=self.output_dir
-            )
-            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
-            return [result_str, result_file_path]
-        except Exception as e:
-            print(f"Error transcribing file: {e}")
-        finally:
-            self.release_cuda_memory()
-            self.remove_input_files([mic_audio])
-    def transcribe_youtube(self,
-                           youtube_link: str,
-                           file_format: str,
-                           add_timestamp: bool,
-                           progress=gr.Progress(),
-                           *whisper_params,
-                           ) -> list:
-        """
-        Write subtitle file from Youtube
-        Parameters
-        ----------
-        youtube_link: str
-            URL of the Youtube video to transcribe from gr.Textbox()
-        file_format: str
-            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        add_timestamp: bool
-            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        progress: gr.Progress
-            Indicator to show progress directly in gradio.
-        *whisper_params: tuple
-            Parameters related with whisper. This will be dealt with "WhisperParameters" data class
-        Returns
-        ----------
-        result_str:
-            Result of transcription to return to gr.Textbox()
-        result_file_path:
-            Output file path to return to gr.Files()
-        """
-        try:
-            progress(0, desc="Loading Audio from Youtube..")
-            yt = get_ytdata(youtube_link)
-            audio = get_ytaudio(yt)
-            transcribed_segments, time_for_task = self.run(
-                audio,
-                progress,
-                *whisper_params,
-            )
-            progress(1, desc="Completed!")
-            file_name = safe_filename(yt.title)
-            subtitle, result_file_path = self.generate_and_write_file(
-                file_name=file_name,
-                transcribed_segments=transcribed_segments,
-                add_timestamp=add_timestamp,
-                file_format=file_format,
-                output_dir=self.output_dir
-            )
-            result_str = f"Done in {self.format_time(time_for_task)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
-            return [result_str, result_file_path]
-        except Exception as e:
-            print(f"Error transcribing file: {e}")
-        finally:
-            try:
-                if 'yt' not in locals():
-                    yt = get_ytdata(youtube_link)
-                    file_path = get_ytaudio(yt)
-                else:
-                    file_path = get_ytaudio(yt)
-                self.release_cuda_memory()
-                self.remove_input_files([file_path])
-            except Exception as cleanup_error:
-                pass
-    @staticmethod
-    def generate_and_write_file(file_name: str,
-                                transcribed_segments: list,
-                                add_timestamp: bool,
-                                file_format: str,
-                                output_dir: str
-                                ) -> str:
-        """
-        Writes subtitle file
-        Parameters
-        ----------
-        file_name: str
-            Output file name
-        transcribed_segments: list
-            Text segments transcribed from audio
-        add_timestamp: bool
-            Determines whether to add a timestamp to the end of the filename.
-        file_format: str
-            File format to write. Supported formats: [SRT, WebVTT, txt]
-        output_dir: str
-            Directory path of the output
-        Returns
-        ----------
-        content: str
-            Result of the transcription
-        output_path: str
-            output file path
-        """
-        if add_timestamp:
-            timestamp = datetime.now().strftime("%m%d%H%M%S")
-            output_path = os.path.join(output_dir, f"{file_name}-{timestamp}")
-        else:
-            output_path = os.path.join(output_dir, f"{file_name}")
-        if file_format == "SRT":
-            content = get_srt(transcribed_segments)
-            output_path += '.srt'
-        elif file_format == "WebVTT":
-            content = get_vtt(transcribed_segments)
-            output_path += '.vtt'
-        elif file_format == "txt":
-            content = get_txt(transcribed_segments)
-            output_path += '.txt'
-        write_file(content, output_path)
-        return content, output_path
-    @staticmethod
-    def format_time(elapsed_time: float) -> str:
-        """
-        Get {hours} {minutes} {seconds} time format string
-        Parameters
-        ----------
-        elapsed_time: str
-            Elapsed time for transcription
-        Returns
-        ----------
-        Time format string
-        """
-        hours, rem = divmod(elapsed_time, 3600)
-        minutes, seconds = divmod(rem, 60)
-        time_str = ""
-        if hours:
-            time_str += f"{hours} hours "
-        if minutes:
-            time_str += f"{minutes} minutes "
-        seconds = round(seconds)
-        time_str += f"{seconds} seconds"
-        return time_str.strip()
-    @staticmethod
-    def get_device():
-        if torch.cuda.is_available():
-            return "cuda"
-        elif torch.backends.mps.is_available():
-            return "mps"
-        else:
-            return "cpu"
-    @staticmethod
-    def release_cuda_memory():
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.reset_max_memory_allocated()
-    @staticmethod
-    def remove_input_files(file_paths: List[str]):
-        if not file_paths:
-            return
-        for file_path in file_paths:
-            if file_path and os.path.exists(file_path):
-                os.remove(file_path)

modules/whisper/whisper_factory.py DELETED Viewed

@@ -1,81 +0,0 @@
-from typing import Optional
-import os
-from modules.whisper.faster_whisper_inference import FasterWhisperInference
-from modules.whisper.whisper_Inference import WhisperInference
-from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
-from modules.whisper.whisper_base import WhisperBase
-class WhisperFactory:
-    @staticmethod
-    def create_whisper_inference(
-        whisper_type: str,
-        whisper_model_dir: str = os.path.join("models", "Whisper"),
-        faster_whisper_model_dir: str = os.path.join("models", "Whisper", "faster-whisper"),
-        insanely_fast_whisper_model_dir: str = os.path.join("models", "Whisper", "insanely-fast-whisper"),
-        diarization_model_dir: str = os.path.join("models", "Diarization"),
-        output_dir: str = os.path.join("outputs"),
-    ) -> "WhisperBase":
-        """
-        Create a whisper inference class based on the provided whisper_type.
-        Parameters
-        ----------
-        whisper_type : str
-            The type of Whisper implementation to use. Supported values (case-insensitive):
-            - "faster-whisper": https://github.com/openai/whisper
-            - "whisper": https://github.com/openai/whisper
-            - "insanely-fast-whisper": https://github.com/Vaibhavs10/insanely-fast-whisper
-        whisper_model_dir : str
-            Directory path for the Whisper model.
-        faster_whisper_model_dir : str
-            Directory path for the Faster Whisper model.
-        insanely_fast_whisper_model_dir : str
-            Directory path for the Insanely Fast Whisper model.
-        diarization_model_dir : str
-            Directory path for the diarization model.
-        output_dir : str
-            Directory path where output files will be saved.
-        Returns
-        -------
-        WhisperBase
-            An instance of the appropriate whisper inference class based on the whisper_type.
-        """
-        # Temporal fix of the bug : https://github.com/jhj0517/Whisper-WebUI/issues/144
-        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
-        whisper_type = whisper_type.lower().strip()
-        faster_whisper_typos = ["faster_whisper", "faster-whisper", "fasterwhisper"]
-        whisper_typos = ["whisper"]
-        insanely_fast_whisper_typos = [
-            "insanely_fast_whisper", "insanely-fast-whisper", "insanelyfastwhisper",
-            "insanely_faster_whisper", "insanely-faster-whisper", "insanelyfasterwhisper"
-        ]
-        if whisper_type in faster_whisper_typos:
-            return FasterWhisperInference(
-                model_dir=faster_whisper_model_dir,
-                output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
-            )
-        elif whisper_type in whisper_typos:
-            return WhisperInference(
-                model_dir=whisper_model_dir,
-                output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
-            )
-        elif whisper_type in insanely_fast_whisper_typos:
-            return InsanelyFastWhisperInference(
-                model_dir=insanely_fast_whisper_model_dir,
-                output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
-            )
-        else:
-            return FasterWhisperInference(
-                model_dir=faster_whisper_model_dir,
-                output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
-            )

modules/whisper/whisper_parameter.py DELETED Viewed

@@ -1,277 +0,0 @@
-from dataclasses import dataclass, fields
-import gradio as gr
-from typing import Optional
-@dataclass
-class WhisperParameters:
-    model_size: gr.Dropdown
-    lang: gr.Dropdown
-    is_translate: gr.Checkbox
-    beam_size: gr.Number
-    log_prob_threshold: gr.Number
-    no_speech_threshold: gr.Number
-    compute_type: gr.Dropdown
-    best_of: gr.Number
-    patience: gr.Number
-    condition_on_previous_text: gr.Checkbox
-    prompt_reset_on_temperature: gr.Slider
-    initial_prompt: gr.Textbox
-    temperature: gr.Slider
-    compression_ratio_threshold: gr.Number
-    vad_filter: gr.Checkbox
-    threshold: gr.Slider
-    min_speech_duration_ms: gr.Number
-    max_speech_duration_s: gr.Number
-    min_silence_duration_ms: gr.Number
-    speech_pad_ms: gr.Number
-    chunk_length_s: gr.Number
-    batch_size: gr.Number
-    is_diarize: gr.Checkbox
-    hf_token: gr.Textbox
-    diarization_device: gr.Dropdown
-    length_penalty: gr.Number
-    repetition_penalty: gr.Number
-    no_repeat_ngram_size: gr.Number
-    prefix: gr.Textbox
-    suppress_blank: gr.Checkbox
-    suppress_tokens: gr.Textbox
-    max_initial_timestamp: gr.Number
-    word_timestamps: gr.Checkbox
-    prepend_punctuations: gr.Textbox
-    append_punctuations: gr.Textbox
-    max_new_tokens: gr.Number
-    chunk_length: gr.Number
-    hallucination_silence_threshold: gr.Number
-    hotwords: gr.Textbox
-    language_detection_threshold: gr.Number
-    language_detection_segments: gr.Number
-    """
-    A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
-    This data class is used to mitigate the key-value problem between Gradio components and function parameters.
-    Related Gradio issue: https://github.com/gradio-app/gradio/issues/2471
-    See more about Gradio pre-processing: https://www.gradio.app/docs/components
-    Attributes
-    ----------
-    model_size: gr.Dropdown
-        Whisper model size.
-    lang: gr.Dropdown
-        Source language of the file to transcribe.
-    is_translate: gr.Checkbox
-        Boolean value that determines whether to translate to English.
-        It's Whisper's feature to translate speech from another language directly into English end-to-end.
-    beam_size: gr.Number
-        Int value that is used for decoding option.
-    log_prob_threshold: gr.Number
-        If the average log probability over sampled tokens is below this value, treat as failed.
-    no_speech_threshold: gr.Number
-        If the no_speech probability is higher than this value AND
-        the average log probability over sampled tokens is below `log_prob_threshold`,
-        consider the segment as silent.
-    compute_type: gr.Dropdown
-        Compute type for transcription.
-        See more info: https://opennmt.net/CTranslate2/quantization.html
-    best_of: gr.Number
-        Number of candidates when sampling with non-zero temperature.
-    patience: gr.Number
-        Beam search patience factor.
-    condition_on_previous_text: gr.Checkbox
-        if True, the previous output of the model is provided as a prompt for the next window;
-        disabling may make the text inconsistent across windows, but the model becomes less prone to
-        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
-    initial_prompt: gr.Textbox
-        Optional text to provide as a prompt for the first window. This can be used to provide, or
-        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
-        to make it more likely to predict those words correctly.
-    temperature: gr.Slider
-        Temperature for sampling. It can be a tuple of temperatures,
-        which will be successively used upon failures according to either
-        `compression_ratio_threshold` or `log_prob_threshold`.
-    compression_ratio_threshold: gr.Number
-        If the gzip compression ratio is above this value, treat as failed
-    vad_filter: gr.Checkbox
-        Enable the voice activity detection (VAD) to filter out parts of the audio
-        without speech. This step is using the Silero VAD model
-        https://github.com/snakers4/silero-vad.
-    threshold: gr.Slider
-        This parameter is related with Silero VAD. Speech threshold.
-        Silero VAD outputs speech probabilities for each audio chunk,
-        probabilities ABOVE this value are considered as SPEECH. It is better to tune this
-        parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
-    min_speech_duration_ms: gr.Number
-        This parameter is related to Silero VAD. Final speech chunks shorter than min_speech_duration_ms are thrown out.
-    max_speech_duration_s: gr.Number
-        This parameter is related with Silero VAD. Maximum duration of speech chunks in seconds. Chunks longer
-        than max_speech_duration_s will be split at the timestamp of the last silence that
-        lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
-        split aggressively just before max_speech_duration_s.
-    min_silence_duration_ms: gr.Number
-        This parameter is related to Silero VAD. At the end of each speech chunk, wait for min_silence_duration_ms
-        before separating it.
-    speech_pad_ms: gr.Number
-        This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
-    chunk_length_s: gr.Number
-        This parameter is related with insanely-fast-whisper pipe.
-        Maximum length of each chunk
-    batch_size: gr.Number
-        This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
-    is_diarize: gr.Checkbox
-        This parameter is related with whisperx. Boolean value that determines whether to diarize or not.
-    hf_token: gr.Textbox
-        This parameter is related to whisperx. A Hugging Face token is needed to download diarization models.
-        Read more: https://huggingface.co/pyannote/speaker-diarization-3.1#requirements
-    diarization_device: gr.Dropdown
-        This parameter is related with whisperx. Device to run diarization model
-    length_penalty:
-        This parameter is related to faster-whisper. Exponential length penalty constant.
-    repetition_penalty:
-        This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
-        (set > 1 to penalize).
-    no_repeat_ngram_size:
-        This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
-    prefix:
-        This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
-    suppress_blank:
-        This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
-    suppress_tokens:
-        This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
-        of symbols as defined in the model config.json file.
-    max_initial_timestamp:
-        This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
-    word_timestamps:
-        This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
-        and dynamic time warping, and include the timestamps for each word in each segment.
-    prepend_punctuations:
-        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
-        with the next word.
-    append_punctuations:
-        This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
-        with the previous word.
-    max_new_tokens:
-        This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
-        the maximum will be set by the default max_length.
-    chunk_length:
-        This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
-        default chunk_length of the FeatureExtractor.
-    hallucination_silence_threshold:
-        This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
-        (in seconds) when a possible hallucination is detected.
-    hotwords:
-        This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
-    language_detection_threshold:
-        This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
-    language_detection_segments:
-        This parameter is related to faster-whisper. Number of segments to consider for the language detection.
-    """
-    def as_list(self) -> list:
-        """
-        Converts the data class attributes into a list. Use in the Gradio UI, before Gradio pre-processing.
-        See more about Gradio pre-processing: https://www.gradio.app/docs/components
-        Returns
-        ----------
-        A list of Gradio components
-        """
-        return [getattr(self, f.name) for f in fields(self)]
-    @staticmethod
-    def as_value(*args) -> 'WhisperValues':
-        """
-        Used to access Whisper parameter values in a function after Gradio post-processing.
-        See more about Gradio post-processing: https://www.gradio.app/docs/components
-        Returns
-        ----------
-        WhisperValues
-           Data class that has values of parameters
-        """
-        return WhisperValues(*args)
-@dataclass
-class WhisperValues:
-    model_size: str
-    lang: str
-    is_translate: bool
-    beam_size: int
-    log_prob_threshold: float
-    no_speech_threshold: float
-    compute_type: str
-    best_of: int
-    patience: float
-    condition_on_previous_text: bool
-    prompt_reset_on_temperature: float
-    initial_prompt: Optional[str]
-    temperature: float
-    compression_ratio_threshold: float
-    vad_filter: bool
-    threshold: float
-    min_speech_duration_ms: int
-    max_speech_duration_s: float
-    min_silence_duration_ms: int
-    speech_pad_ms: int
-    chunk_length_s: int
-    batch_size: int
-    is_diarize: bool
-    hf_token: str
-    diarization_device: str
-    length_penalty: float
-    repetition_penalty: float
-    no_repeat_ngram_size: int
-    prefix: Optional[str]
-    suppress_blank: bool
-    suppress_tokens: Optional[str]
-    max_initial_timestamp: float
-    word_timestamps: bool
-    prepend_punctuations: Optional[str]
-    append_punctuations: Optional[str]
-    max_new_tokens: Optional[int]
-    chunk_length: Optional[int]
-    hallucination_silence_threshold: Optional[float]
-    hotwords: Optional[str]
-    language_detection_threshold: Optional[float]
-    language_detection_segments: int
-    """
-    A data class to use Whisper parameters.
-    """

outputs/outputs are saved here.txt DELETED Viewed

File without changes

outputs/translations/outputs for translation are saved here.txt DELETED Viewed

File without changes