"""Gradio demo: transcribe microphone audio with Wav2Vec2-base-960h."""

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the pretrained English ASR model once at startup so requests reuse it.
model_id = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
def transcribe(audio_file, progress=gr.Progress()):
    """Transcribe a recorded audio file to lowercase English text.

    Args:
        audio_file: Path to the recorded audio (Gradio ``type="filepath"``),
            or ``None`` when no audio was captured.
        progress: Gradio progress tracker, injected by Gradio when the
            default is ``gr.Progress()``; currently unused in the body.

    Returns:
        The lowercase transcription string, or a warning message when no
        audio was received.
    """
    if audio_file is None:
        return "⚠️ No audio received."

    waveform, sample_rate = torchaudio.load(audio_file)

    # Wav2Vec2-base-960h was trained on 16 kHz audio; resample if needed.
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=16000
        )
        sample_rate = 16000

    # Downmix multi-channel recordings to a single mono channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0).unsqueeze(0)

    input_values = processor(
        waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()
# Build the web UI: microphone in, transcription text out.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="🎤 Speak now"),
    outputs=gr.Textbox(label="📝 Transcription"),
    title="Wav2Vec2 Speech Transcription",
    description="Speak into the microphone and get a transcription using Wav2Vec2-base.",
    flagging_mode="never",
)

demo.launch()