"""Gradio demo that transcribes recorded audio with a Wav2Vec2 CTC model."""

import gradio as gr
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

MODEL_NAME = "facebook/wav2vec2-large-960h-lv60-self"

# Load model and processor once
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# The feature extractor rejects audio whose sampling rate differs from the
# one it was configured with, so every input is resampled to this rate.
TARGET_SAMPLE_RATE = processor.feature_extractor.sampling_rate


def _to_mono_float32(waveform):
    """Return *waveform* as a 1-D float32 array.

    Integer PCM is scaled by the maximum value of its dtype so samples land
    roughly in [-1, 1]. Arrays with more than one dimension are averaged over
    the last axis (assumes a ``(samples, channels)`` layout, which is what
    Gradio's numpy audio is expected to use -- TODO confirm for other callers).
    """
    samples = np.asarray(waveform)
    if np.issubdtype(samples.dtype, np.integer):
        scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        samples = samples.mean(axis=-1)
    return samples


def _resample(samples, source_rate, target_rate):
    """Linearly interpolate 1-D *samples* from *source_rate* to *target_rate*.

    NOTE: plain linear interpolation applies no anti-aliasing filter. It keeps
    the script free of extra dependencies; swap in a proper resampler if
    transcription quality on high-rate input matters.
    """
    if source_rate == target_rate or samples.size == 0:
        return samples
    target_len = max(1, int(round(samples.size * target_rate / source_rate)))
    source_times = np.arange(samples.size) / source_rate
    target_times = np.arange(target_len) / target_rate
    return np.interp(target_times, source_times, samples).astype(np.float32)


def transcribe(audio):
    """Transcribe one Gradio audio clip to text.

    Args:
        audio: ``(sample_rate, numpy_array)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.

    Returns:
        The decoded transcription string; an empty string when there is no
        audio to transcribe.
    """
    # Gradio passes None when the user submits without recording anything.
    if audio is None:
        return ""

    sample_rate, waveform = audio
    samples = _to_mono_float32(waveform)
    if samples.size == 0:
        return ""
    samples = _resample(samples, sample_rate, TARGET_SAMPLE_RATE)

    input_values = processor(
        samples,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding="longest",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: pick the most likely token at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]


# Build Gradio interface
# NOTE(review): `source=` is the Gradio 3.x spelling; newer Gradio releases
# appear to use `sources=[...]` instead -- confirm against the installed version.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="numpy"),
    outputs="text",
    title="Wav2Vec2 ASR",
    description="Record or upload audio, and get transcription using Wav2Vec2 large model.",
)

iface.launch()