Spaces:
Build error
Build error
| import gradio as gr | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
| import torch | |
| import torchaudio | |
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "facebook/wav2vec2-base-960h"

# The processor is instantiated first and the CTC model second; both are
# module-level objects used by speech_to_text.
processor, model = (
    Wav2Vec2Processor.from_pretrained(model_name),
    Wav2Vec2ForCTC.from_pretrained(model_name),
)
# Sampling rate (Hz) the waveform is converted to and declared to the processor.
TARGET_SAMPLE_RATE = 16000


def speech_to_text(audio):
    """Transcribe an audio file using the module-level processor and model.

    Args:
        audio: Path of the audio file to transcribe, or ``None`` when no
            audio was supplied.

    Returns:
        The decoded transcription string. When ``audio`` is ``None`` or the
        loaded clip has no samples, a short explanatory message is returned
        instead. If any step raises, the exception text is returned rather
        than propagated (best-effort behaviour kept from the original).
    """
    if audio is None:
        return "No audio file provided."

    try:
        waveform, rate = torchaudio.load(audio)

        # A clip with zero samples would otherwise fail inside the model with
        # an obscure message; report it explicitly.
        if waveform.numel() == 0:
            return "The audio file contains no samples."

        # Down-mix to a single channel by averaging over the first axis
        # (assumed to be the channel axis, as the original code does).
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Only resample when the file is not already at the target rate.
        if rate != TARGET_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                orig_freq=rate, new_freq=TARGET_SAMPLE_RATE
            )
            waveform = resampler(waveform)

        # squeeze(0) removes just the leading axis; a bare squeeze() would
        # also collapse a one-sample clip into a 0-d array.
        inputs = processor(
            waveform.squeeze(0).numpy(),
            return_tensors="pt",
            sampling_rate=TARGET_SAMPLE_RATE,
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Pick the highest-scoring token id per frame, then decode the
        # single item of the batch.
        predicted_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(predicted_ids)[0]
    except Exception as e:
        # Deliberate best-effort: the message is returned as the output text
        # instead of raising.
        return str(e)
# Input and output components, built in the same order as before.
audio_input = gr.Audio(type="filepath", label="Input Audio")
transcription_output = gr.Textbox(label="Transcription")

# Wire speech_to_text to the components defined above.
iface = gr.Interface(
    title="Speech to Text",
    description="Speak into your microphone and get the transcribed text.",
    fn=speech_to_text,
    inputs=audio_input,
    outputs=transcription_output,
    live=True,
)

# Start the app.
iface.launch()