File size: 1,383 Bytes
ddf4458
 
1c1fe8b
ddf4458
 
9c52375
b9c6a99
ddf4458
 
 
dcc8cf4
1c1fe8b
ddf4458
 
1c1fe8b
dcc8cf4
ddf4458
1c1fe8b
 
ddf4458
1c1fe8b
 
 
 
7658fb6
ddf4458
 
1c1fe8b
 
 
ddf4458
 
 
 
1c1fe8b
ddf4458
 
9c52375
 
ddf4458
 
9c52375
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the ASR model once at import time so every request reuses the same
# weights instead of reloading them per call.
model_id = "facebook/wav2vec2-base-960h"  # pretrained English CTC checkpoint
processor = Wav2Vec2Processor.from_pretrained(model_id)  # feature extraction + CTC token decoding
model = Wav2Vec2ForCTC.from_pretrained(model_id)  # used under torch.no_grad() in transcribe()

def transcribe(audio_file, progress=gr.Progress()):
    """Transcribe a recorded audio clip to lowercase text with Wav2Vec2.

    Args:
        audio_file: Filesystem path to the recorded clip, or ``None`` when
            the microphone component captured nothing.
        progress: Gradio progress tracker, injected by the UI framework.

    Returns:
        The lowercased transcription string, or a warning message when no
        audio was received.
    """
    if audio_file is None:
        return "⚠️ No audio received."

    signal, sr = torchaudio.load(audio_file)

    # Wav2Vec2-base-960h was trained on 16 kHz audio; resample anything else.
    if sr != 16000:
        signal = torchaudio.functional.resample(signal, orig_freq=sr, new_freq=16000)
        sr = 16000

    # Fold multi-channel recordings down to mono by averaging the channels.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0).unsqueeze(0)

    features = processor(signal.squeeze().numpy(), sampling_rate=sr, return_tensors="pt")

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(features.input_values).logits

    # Greedy CTC decoding: pick the most likely token at each frame.
    ids = logits.argmax(dim=-1)
    return processor.batch_decode(ids)[0].lower()

# Wire up the Gradio UI: microphone in, transcription text out.
mic_input = gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak now")
text_output = gr.Textbox(label="📝 Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=mic_input,
    outputs=text_output,
    title="Wav2Vec2 Speech Transcription",
    description="Speak into the microphone and get a transcription using Wav2Vec2-base.",
    flagging_mode="never",
)

demo.launch()