import spaces
import torch
import gradio as gr
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
# Pick GPU when available; fall back to CPU. fp16 only makes sense on CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

MODEL_NAME = "openai/whisper-large-v3-turbo"

# Load the Whisper checkpoint once at module import; `low_cpu_mem_usage` and
# safetensors keep peak RAM down, and SDPA attention is the fast built-in path.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="sdpa"
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

# Shared ASR pipeline used by both the streaming and the file-upload handlers.
# chunk_length_s=10 splits long audio into 10 s windows for transcription.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    device=device,
    ignore_warning=True,
)
@spaces.GPU
def stream_transcribe(stream, new_chunk):
    """Append a fresh microphone chunk to the running buffer and re-transcribe.

    Args:
        stream: accumulated mono float32 audio so far, or None on first call.
        new_chunk: (sample_rate, samples) tuple from the gr.Audio stream.

    Returns:
        (updated buffer, transcription text, latency as "S.SS" string);
        on error, the unchanged buffer, the error message, and "Error".
    """
    t0 = time.time()
    try:
        sample_rate, chunk = new_chunk

        # Down-mix multi-channel audio to mono.
        if chunk.ndim > 1:
            chunk = chunk.mean(axis=1)

        # Peak-normalize to [-1, 1] float32; skip division for pure silence.
        chunk = chunk.astype(np.float32)
        peak = np.max(np.abs(chunk))
        if peak > 0:
            chunk /= peak

        # Grow the session-wide buffer and transcribe it from the start.
        stream = chunk if stream is None else np.concatenate([stream, chunk])
        text = pipe({"sampling_rate": sample_rate, "raw": stream})["text"]

        return stream, text, f"{time.time() - t0:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return stream, str(e), "Error"
@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe one uploaded clip and append it to the running transcript.

    Args:
        inputs: (sample_rate, samples) numpy tuple from gr.Audio(type="numpy").
        previous_transcription: transcript text accumulated so far.

    Returns:
        (updated transcript, latency as "S.SS" string); on error, the
        unchanged transcript and "Error".
    """
    start_time = time.time()
    # Unique name so concurrent requests never clobber each other's file.
    filename = f"{uuid.uuid4().hex}.wav"
    try:
        sample_rate, audio_data = inputs
        # The pipeline's file path front-end handles resampling/decoding,
        # so round-trip the raw samples through a wav file.
        scipy.io.wavfile.write(filename, sample_rate, audio_data)
        transcription = pipe(filename)["text"]
        previous_transcription += transcription
        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return previous_transcription, "Error"
    finally:
        # Fix: the original leaked one wav file per call — always clean up,
        # even when writing or transcription raised.
        if os.path.exists(filename):
            os.remove(filename)
def clear():
    """Return an empty string, used to reset the transcription textbox."""
    return ""
def clear_state():
    """Return None, used to drop the accumulated audio buffer in gr.State."""
    return
# --- "Microphone" tab: realtime streaming transcription ---
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        # Holds the growing raw-audio buffer between stream events (None at start).
        state = gr.State()
        # Each incoming chunk flows through stream_transcribe, which updates the
        # buffer, the transcription textbox, and the latency readout.
        input_audio_microphone.stream(
            stream_transcribe,
            inputs=[state, input_audio_microphone],
            outputs=[state, output, latency_textbox]
        )
        # Reset the audio buffer first, then blank the textbox.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
# --- "Transcribe from file" tab: one-shot transcription of an uploaded clip ---
# NOTE(review): `file` shadows the builtin of the same name; renaming would be
# cleaner but must be coordinated with the TabbedInterface reference below.
with gr.Blocks() as file:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
        with gr.Row():
            # type="numpy" delivers (sample_rate, samples) to the handler.
            input_audio_microphone = gr.Audio(sources="upload", type="numpy")
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            submit_button = gr.Button("Submit")
            clear_button = gr.Button("Clear Output")
        # `output` is fed back in so new text is appended to the prior transcript.
        submit_button.click(transcribe, inputs=[input_audio_microphone, output], outputs=[output, latency_textbox])
        clear_button.click(clear, outputs=[output])
# Assemble both pages into a single tabbed app.
with gr.Blocks() as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

if __name__ == "__main__":
    # share=True requests a public tunnel URL in addition to the local server.
    demo.launch(share=True)