"""Gradio demo that transcribes speech with a HuBERT CTC model."""

from typing import Optional

import gradio as gr
import librosa
import torch
from transformers import HubertForCTC, Wav2Vec2Processor

# Load the model and processor from Hugging Face Hub
model_name = "Ansu/mHubert-basque-ASR"  # Change this to your model
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = HubertForCTC.from_pretrained(model_name)

# Sampling rate used both when loading audio and when building model inputs.
TARGET_SAMPLE_RATE = 16000


def transcribe(audio: Optional[str]) -> str:
    """Transcribe an audio file with the loaded CTC model.

    Args:
        audio: Path to the audio file supplied by the Gradio ``Audio``
            component (configured with ``type="filepath"``), or ``None``
            when no audio is available (e.g. the input was cleared).

    Returns:
        The decoded transcription, or an empty string when there is no
        audio to transcribe.
    """
    # The interface is live, so this callback may fire without any audio;
    # return early instead of letting librosa fail on a missing path.
    if audio is None:
        return ""

    # Load the file, resampled to the rate passed to the processor below.
    waveform, _ = librosa.load(audio, sr=TARGET_SAMPLE_RATE)
    if waveform.size == 0:
        # Nothing was recorded; there is nothing to feed to the model.
        return ""

    # Process input
    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
        padding=True,
    )

    # Get model predictions; no gradients are needed for inference.
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy decoding: take the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode predictions (single-item batch, so take the first result).
    return processor.batch_decode(predicted_ids)[0]


# Create Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="🎤 Upload or Record Audio",
    ),
    outputs="text",
    title="HuBERT ASR Demo",
    description="🎙️ Speak into the microphone or upload an audio file to get a transcription.",
    live=True,  # Re-run transcription automatically whenever the input changes
)

iface.launch()