Audio-WebUI

Paused

App Files Files Community

kadirnar committed on Nov 28, 2023

Commit

70814d8

1 Parent(s): 2ec3c3e

Update app.py

Browse files

Files changed (1) hide show

app.py +122 -65

app.py CHANGED Viewed

@@ -1,71 +1,10 @@
 import gradio as gr
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
-import logging
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-class SpeechToTextPipeline:
-    """Class for converting audio to text using a pre-trained speech recognition model."""
-    # The constructor loads the model once; the processor and the transformers pipeline are
-    # created again on every call to the instance.
-    def __init__(self, model_id: str = "openai/whisper-large-v3"):
-        self.model = None
-        self.device = None
-        # NOTE(review): self.model was assigned None two lines above, so this condition always
-        # holds and the else branch below never runs.
-        if self.model is None:
-            self.load_model(model_id)
-        else:
-            logging.info("Model already loaded.")
-    def load_model(self, model_id: str = "openai/whisper-large-v3"):
-        """
-        Loads the pre-trained speech recognition model in float16, passes it through
-        ``model.to(self.device)`` and stores it on ``self.model``.
-        Args:
-            model_id (str): Identifier of the pre-trained model to be loaded.
-        """
-        logging.info("Loading model...")
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True)
-        # NOTE(review): self.device is only ever assigned None in this class, so no concrete
-        # device is named here; the pipeline built in __call__ hard-codes device="cuda" instead.
-        model.to(self.device)
-        logging.info("Model loaded successfully.")
-        self.model = model
-    def __call__(self, audio_path: str, model_id: str = "openai/whisper-large-v3", language: str = "turkish"):
-        """
-        Converts audio to text using the pre-trained speech recognition model.
-        Args:
-            audio_path (str): Path to the audio file to be transcribed.
-            model_id (str): Identifier used to load the processor (tokenizer and feature extractor).
-                The model itself is the one already stored on ``self.model``.
-            language (str): Language forwarded to generation through ``generate_kwargs``.
-        Returns:
-            str: Transcribed text from the audio (the ``"text"`` entry of the pipeline output).
-        """
-        # The processor is fetched on every call; only its tokenizer and feature extractor are used.
-        processor = AutoProcessor.from_pretrained(model_id)
-        pipe = pipeline(
-            "automatic-speech-recognition",
-            model=self.model,
-            torch_dtype=torch.float16,
-            chunk_length_s=30,
-            max_new_tokens=128,
-            batch_size=24,
-            return_timestamps=True,
-            device="cuda",
-            tokenizer=processor.tokenizer,
-            feature_extractor=processor.feature_extractor,
-            model_kwargs={"use_flash_attention_2": True},
-            generate_kwargs={"language": language},
-        )
-        logging.info("Transcribing audio...")
-        # Timestamps are requested above, but only the plain text is returned to the caller.
-        result = pipe(audio_path)["text"]
-        return result
 def youtube_url_to_text(url, model_id, language_choice):
     """
@@ -88,6 +27,36 @@ def youtube_url_to_text(url, model_id, language_choice):
     return transcript, video_path
 def youtube_url_to_text_app():
     with gr.Blocks():
         with gr.Row():
@@ -134,6 +103,92 @@ def youtube_url_to_text_app():
             ],
             outputs=[output_text, output_audio],
         )
 gradio_app = gr.Blocks()
@@ -155,6 +210,8 @@ with gradio_app:
         with gr.Column():
             with gr.Tab(label="Youtube URL to Text"):
                 youtube_url_to_text_app()
 gradio_app.queue()
-gradio_app.launch(debug=True)

 import gradio as gr
+from whisperplus.pipelines.whisper import SpeechToTextPipeline
+from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
 from whisperplus.utils.download_utils import download_and_convert_to_mp3
+from whisperplus.utils.text_utils import format_speech_to_dialogue
 def youtube_url_to_text(url, model_id, language_choice):
     """
     return transcript, video_path
+def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
+    """
+    Main function that downloads and converts a video to MP3 format, performs speech-to-text conversion using
+    a specified model, and returns the transcript along with the video path.
+    Args:
+        url (str): The URL of the video to download and convert.
+        model_id (str): The ID of the speech-to-text model to use.
+        language_choice (str): The language choice for the speech-to-text conversion.
+    Returns:
+        transcript (str): The transcript of the speech-to-text conversion.
+        video_path (str): The path of the downloaded video.
+    """
+    pipeline = ASRDiarizationPipeline.from_pretrained(
+        asr_model=model_id,
+        diarizer_model="pyannote/speaker-diarization",
+        use_auth_token="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",
+        chunk_length_s=30,
+        device=device,
+    )
+    audio_path = download_and_convert_to_mp3(url)
+    output_text = pipeline(
+        audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
+    dialogue = format_speech_to_dialogue(output_text)
+    return dialogue, audio_path
 def youtube_url_to_text_app():
     with gr.Blocks():
         with gr.Row():
             ],
             outputs=[output_text, output_audio],
         )
+        gr.Examples(
+            examples=[
+                [
+                    "https://www.youtube.com/watch?v=di3rHkEZuUw",
+                    "openai/whisper-large-v3",
+                    "English",
+                ],
+            ],
+            fn=youtube_url_to_text,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                language_choice,
+            ],
+            outputs=[output_text, output_audio],
+            cache_examples=True,
+        )
+def speaker_diarization_app():
+    with gr.Blocks():
+        with gr.Row():
+            with gr.Column():
+                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
+                whisper_model_id = gr.Dropdown(
+                    choices=[
+                        "openai/whisper-large-v3",
+                        "openai/whisper-large",
+                        "openai/whisper-medium",
+                        "openai/whisper-base",
+                        "openai/whisper-small",
+                        "openai/whisper-tiny",
+                    ],
+                    value="openai/whisper-large-v3",
+                    label="Whisper Model",
+                )
+                device = gr.Dropdown(
+                    choices=["cpu", "cuda", "mps"],
+                    value="cuda",
+                    label="Device",
+                )
+                num_speakers = gr.Number(value=2, label="Number of Speakers")
+                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
+                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
+                whisperplus_in_predict = gr.Button(value="Generator")
+            with gr.Column():
+                output_text = gr.Textbox(label="Output Text")
+                output_audio = gr.Audio(label="Output Audio")
+        whisperplus_in_predict.click(
+            fn=speaker_diarization,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                device,
+                num_speakers,
+                min_speaker,
+                max_speaker,
+            ],
+            outputs=[output_text, output_audio],
+        )
+        gr.Examples(
+            examples=[
+                [
+                    "https://www.youtube.com/shorts/o8PgLUgte2k",
+                    "openai/whisper-large-v3",
+                    "mps",
+                    2,
+                    1,
+                    2,
+                ],
+            ],
+            fn=speaker_diarization,
+            inputs=[
+                youtube_url_path,
+                whisper_model_id,
+                device,
+                num_speakers,
+                min_speaker,
+                max_speaker,
+            ],
+            outputs=[output_text, output_audio],
+            cache_examples=True,
+        )
 gradio_app = gr.Blocks()
         with gr.Column():
             with gr.Tab(label="Youtube URL to Text"):
                 youtube_url_to_text_app()
+            with gr.Tab(label="Speaker Diarization"):
+                speaker_diarization_app()
 gradio_app.queue()
+gradio_app.launch(debug=True)