Audio-WebUI

Paused

App Files Files Community

kadirnar committed on Nov 28, 2023

Commit

bd8e31e

1 Parent(s): 580270a

update

Browse files

Files changed (1) hide show

app.py +64 -2

app.py CHANGED Viewed

@@ -1,9 +1,71 @@
 import gradio as gr
-from whisperplus.pipelines.whisper import SpeechToTextPipeline
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
-from whisperplus.utils.text_utils import format_speech_to_dialogue
 def youtube_url_to_text(url, model_id, language_choice):
     """

 import gradio as gr
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
+import logging
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+class SpeechToTextPipeline:
+    """Class for converting audio to text using a pre-trained speech recognition model.
+
+    The model is loaded once in ``__init__`` (via ``load_model``) and kept on
+    ``self.model``. Calling the instance builds a ``transformers`` pipeline
+    around that model and returns the ``"text"`` field of its result.
+    """
+    def __init__(self, model_id: str = "openai/whisper-large-v3"):
+        self.model = None
+        self.device = None  # never reassigned in this class; load_model passes it to model.to()
+        if self.model is None:  # always true here: self.model was set to None just above
+            self.load_model(model_id)
+        else:
+            logging.info("Model already loaded.")
+    def load_model(self, model_id: str = "openai/whisper-large-v3"):
+        """
+        Loads the pre-trained speech recognition model and moves it to the specified device.
+
+        The weights are requested as float16 safetensors with low CPU memory
+        usage, and the loaded model is stored on ``self.model``.
+
+        NOTE(review): the target device is ``self.device``, which is ``None``
+        when this is called from ``__init__`` -- confirm the intended device.
+
+        Args:
+            model_id (str): Identifier of the pre-trained model to be loaded.
+        """
+        logging.info("Loading model...")
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
+        model.to(self.device)
+        logging.info("Model loaded successfully.")
+        self.model = model
+    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
+        """
+        Converts audio to text using the pre-trained speech recognition model.
+
+        A new pipeline is built on every call. The model it runs is always
+        ``self.model``; ``model_id`` is only used here to load the processor
+        that supplies the tokenizer and feature extractor.
+
+        Args:
+            audio_path (str): Path to the audio file to be transcribed.
+            model_id (str): Identifier of the pre-trained model whose processor
+                (tokenizer and feature extractor) is loaded for the pipeline.
+                NOTE(review): presumably this should match the model given to
+                ``__init__`` -- confirm against callers.
+            language (str): Language forwarded to generation through
+                ``generate_kwargs``. Defaults to "turkish".
+        Returns:
+            str: Transcribed text from the audio.
+        """
+        processor = AutoProcessor.from_pretrained(model_id)
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=self.model,
+            torch_dtype=torch.float16,
+            chunk_length_s=30,
+            max_new_tokens=128,
+            batch_size=24,
+            return_timestamps=True,
+            device="cuda",  # hard-coded; self.device is not consulted here
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            model_kwargs={"use_flash_attention_2": True},  # NOTE(review): model is passed already loaded -- confirm these kwargs take effect
+            generate_kwargs={"language": language},
+        )
+        logging.info("Transcribing audio...")
+        result = pipe(audio_path)["text"]  # only the text is returned; the timestamps requested above are not
+        return result
 def youtube_url_to_text(url, model_id, language_choice):
     """