Spaces:

mozilla-ai
/

speech-to-text-alignment

Running

App Files Files Community

kostissz commited on Apr 17

Commit

dcbed68

verified ·

1 Parent(s): d11c040

Add init demo version

Browse files

Files changed (1) hide show

app.py +131 -0

app.py ADDED Viewed

	@@ -0,0 +1,131 @@

+import csv
+from pathlib import Path
+from typing import Tuple
+import gradio as gr
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from whisper_bidec import decode_wav, get_logits_processor, load_corpus_from_sentences
+def _parse_file(file_path: str) -> list[str]:
+    """Parse .txt / .md / .csv and return its content as a list of strings by splitting per new line or row."""
+    if file_path.endswith(".csv"):
+        sentences = []
+        with open(file_path, "r", encoding="utf-8") as f:
+            reader = csv.reader(f)
+            for row in reader:
+                sentences.append(row)
+    else:
+        with open(file_path, "r") as f:
+            sentences = f.readlines()
+    return sentences
+def transcribe(
+    processor_name: str,
+    audio: str,
+    bias_strength: float,
+    bias_text: str | None,
+    bias_text_file: str | None,
+) -> Tuple[str, str]:
+    processor = WhisperProcessor.from_pretrained(processor_name)
+    model = WhisperForConditionalGeneration.from_pretrained(processor_name)
+    sentences = ""
+    if bias_text:
+        sentences = bias_text.split(",")
+    elif Path(bias_text_file).is_file():
+        sentences = _parse_file(bias_text_file)
+    if sentences:
+        corpus = load_corpus_from_sentences(sentences, processor)
+        logits_processor = get_logits_processor(
+            corpus=corpus, processor=processor, bias_towards_lm=bias_strength
+        )
+        text_with_bias = decode_wav(
+            model, processor, audio, logits_processor=logits_processor
+        )
+    else:
+        text_with_bias = ""
+    text_no_bias = decode_wav(model, processor, audio, logits_processor=None)
+    return text_no_bias, text_with_bias
+def setup_gradio_demo():
+    css = """
+    #centered-column {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        flex-direction: column;
+        text-align: center;
+    }
+    """
+    with gr.Blocks(css=css) as demo:
+        gr.Markdown("# Whisper Bidec Demo")
+        gr.Markdown("## Step 1: Select a Whisper model")
+        processor = gr.Textbox(
+            value="openai/whisper-tiny.en", label="Whisper Model from Hugging Face"
+        )
+        gr.Markdown("## Step 2: Upload your audio file")
+        audio_clip = gr.Audio(type="filepath", label="Upload a WAV file")
+        gr.Markdown("## Step 3: Set your biasing text")
+        with gr.Row():
+            with gr.Column(scale=20):
+                gr.Markdown(
+                    "You can add multiple possible sentences by separating them with a comma <,>."
+                )
+                bias_text = gr.Textbox(label="Write your biasing text here")
+            with gr.Column(scale=1, elem_id="centered-column"):
+                gr.Markdown("## OR")
+            with gr.Column(scale=20):
+                gr.Markdown(
+                    "Note that each new line (.txt / .md) or row (.csv) will be treated as a separate sentence to bias towards to."
+                )
+                bias_text_file = gr.File(
+                    label="Upload a file with multiple lines of text",
+                    file_types=[".txt", ".md", ".csv"],
+                )
+        gr.Markdown("## Step 4: Set how much you want to bias towards the LM")
+        bias_amount = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.5,
+            step=0.1,
+            label="Bias strength",
+            interactive=True,
+        )
+        gr.Markdown("## Step 5: Get your transcription before and after biasing")
+        transcribe_button = gr.Button("Transcribe")
+        with gr.Row():
+            with gr.Column():
+                output = gr.Text(label="Output")
+            with gr.Column():
+                biased_output = gr.Text(label="Biased output")
+        transcribe_button.click(
+            fn=transcribe,
+            inputs=[
+                processor,
+                audio_clip,
+                bias_amount,
+                bias_text,
+                bias_text_file,
+            ],
+            outputs=[output, biased_output],
+        )
+    demo.launch()
+if __name__ == "__main__":
+    setup_gradio_demo()