File size: 4,184 Bytes
7b34cad
 
 
1f8fa97
7b34cad
 
 
813e4b2
a9a8aec
7b34cad
 
574825e
7b34cad
 
8f2a46b
078e579
574825e
078e579
 
721ab04
8f2a46b
7b34cad
d6a23f4
7b34cad
 
a9a8aec
 
 
 
 
 
8f2a46b
7b34cad
721ab04
a9a8aec
 
e038230
a9a8aec
7b34cad
e038230
a9a8aec
1f8fa97
a9a8aec
 
7b34cad
a9a8aec
62fccb4
 
 
7b34cad
 
a9a8aec
55d67f9
a9a8aec
8f2a46b
3e64dd3
a9a8aec
 
7b34cad
a9a8aec
1f8fa97
7b34cad
721ab04
1f8fa97
a9a8aec
7b34cad
 
1f8fa97
7b34cad
 
 
 
3e64dd3
7b34cad
 
a9a8aec
 
7b34cad
1f8fa97
7b34cad
 
1f8fa97
7b34cad
a9a8aec
 
 
 
1f8fa97
7b34cad
 
721ab04
a9a8aec
7b34cad
 
 
a9a8aec
7b34cad
 
721ab04
 
 
574825e
721ab04
7b34cad
 
 
 
721ab04
7b34cad
 
 
 
 
 
 
 
721ab04
7b34cad
 
a3d95dd
8f2a46b
62fccb4
a3d95dd
3e64dd3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import spaces
import torch
import gradio as gr
import os
import uuid
import scipy.io.wavfile
import time  
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline

# Select GPU when available; fall back to CPU. fp16 only makes sense on CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MODEL_NAME = "openai/whisper-large-v3-turbo"

# Load the Whisper seq2seq checkpoint once at import time (module-level
# singleton shared by both Gradio tabs). SDPA attention is requested
# explicitly; safetensors + low_cpu_mem_usage keep the load footprint down.
# NOTE(review): `dtype=` is the newer transformers kwarg (older releases
# used `torch_dtype=`) — confirm against the pinned transformers version.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, 
    dtype=torch_dtype, 
    low_cpu_mem_usage=True, 
    use_safetensors=True, 
    attn_implementation="sdpa"
)
model.to(device)

# Processor supplies the feature extractor; tokenizer is loaded separately
# so it can be passed to the pipeline explicitly.
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

# ASR pipeline used by both tabs. chunk_length_s=10 trades accuracy for
# latency in the streaming path; ignore_warning silences the chunking notice.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    device=device,
    ignore_warning=True,
)

@spaces.GPU
def stream_transcribe(stream, new_chunk):
    """Accumulate a microphone chunk into the running buffer and re-transcribe.

    Args:
        stream: float32 mono buffer of all audio so far, or None on first call.
        new_chunk: ``(sample_rate, samples)`` tuple from ``gr.Audio`` streaming.

    Returns:
        ``(stream, text, latency)`` — the grown buffer, the full transcription
        of everything heard so far, and the elapsed time formatted to 2 dp.
        On failure the buffer is returned unchanged with the error message.
    """
    t0 = time.time()
    try:
        sr, chunk = new_chunk

        # Downmix multi-channel input to mono.
        if chunk.ndim > 1:
            chunk = chunk.mean(axis=1)

        # Peak-normalize to [-1, 1]; skip silent (all-zero) chunks.
        chunk = chunk.astype(np.float32)
        peak = np.max(np.abs(chunk))
        if peak > 0:
            chunk /= peak

        # Grow the rolling buffer (first chunk starts it).
        stream = chunk if stream is None else np.concatenate([stream, chunk])

        # Re-run ASR over the entire buffer each time, so the text is
        # always a transcription of the full session so far.
        text = pipe({"sampling_rate": sr, "raw": stream})["text"]
        elapsed = time.time() - t0

        return stream, text, f"{elapsed:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return stream, str(e), "Error"

@spaces.GPU
def transcribe(inputs, previous_transcription):
    """Transcribe an uploaded clip and append the text to the running output.

    Args:
        inputs: ``(sample_rate, audio_data)`` numpy tuple from ``gr.Audio``.
        previous_transcription: text already shown in the output box.

    Returns:
        ``(previous_transcription + new_text, latency_str)``; on failure the
        existing text is returned unchanged with ``"Error"`` as the latency.
    """
    start_time = time.time()
    try:
        # The pipeline wants a file path, so round-trip through a uniquely
        # named wav in the working directory.
        filename = f"{uuid.uuid4().hex}.wav"
        sample_rate, audio_data = inputs
        scipy.io.wavfile.write(filename, sample_rate, audio_data)

        try:
            transcription = pipe(filename)["text"]
        finally:
            # Fix: the temp wav was previously never deleted, leaking one
            # file per request; remove it even if transcription fails.
            os.remove(filename)

        previous_transcription += transcription

        latency = time.time() - start_time
        return previous_transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return previous_transcription, "Error"

def clear():
    """Reset a textbox: return the empty string Gradio writes into it."""
    return str()

def clear_state():
    """Reset the streaming audio buffer: None means 'no audio yet'."""
    return

# --- Tab 1: live microphone transcription -----------------------------------
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
        with gr.Row():
            # streaming=True makes gr.Audio emit periodic (sr, samples) chunks.
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        # Holds the accumulated float32 audio buffer between stream events.
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe, 
            inputs=[state, input_audio_microphone], 
            outputs=[state, output, latency_textbox]
        )
        # Clear the buffer first, then the visible text.
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

# --- Tab 2: transcribe an uploaded audio file --------------------------------
with gr.Blocks() as file:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo\nTranscribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.")
        with gr.Row():
            # NOTE(review): despite the name, this is an upload widget,
            # not a microphone; type="numpy" yields (sample_rate, samples).
            input_audio_microphone = gr.Audio(sources="upload", type="numpy")
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            submit_button = gr.Button("Submit")
            clear_button = gr.Button("Clear Output")

        # `output` is passed back in so new text is appended, not replaced.
        submit_button.click(transcribe, inputs=[input_audio_microphone, output], outputs=[output, latency_textbox])
        clear_button.click(clear, outputs=[output])

# Assemble both tabs into a single app.
with gr.Blocks() as demo:
    gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])

if __name__ == "__main__":
    # share=True opens a public Gradio tunnel in addition to the local server.
    demo.launch(share=True)