Spaces:
Sleeping
Sleeping
import subprocess
import sys

# Install/upgrade the third-party packages this script imports further down.
# pip is invoked as a module of the interpreter that is running this script
# (``sys.executable -m pip``) so the packages land in the same environment the
# imports resolve from; a bare ``pip`` on PATH may belong to another Python or
# be missing altogether.
# The return codes are intentionally not checked: installation stays
# best-effort, and a genuinely missing package surfaces as an ImportError on
# the import lines that follow.
for pip_args in (
    ["gradio", "--upgrade"],
    ["transformers"],
    ["torchaudio", "--upgrade"],
):
    subprocess.run([sys.executable, "-m", "pip", "install", *pip_args])
import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the processor and the CTC model from one and the same pretrained
# checkpoint; the identifier is kept in a single place so the two cannot drift.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-italian"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
# Sampling rate (Hz) the audio is resampled to before it is handed to the
# processor and model.
TARGET_SAMPLE_RATE = 16000


def transcribe_audio(audio_data):
    """Transcribe one audio clip with the module-level Wav2Vec2 model.

    Args:
        audio_data: ``(sample_rate, samples)`` pair as delivered by
            ``gr.Audio`` in its default NumPy mode, where ``samples`` is an
            array of shape ``(n,)`` for mono or ``(n, channels)`` for
            multi-channel audio. ``None`` (nothing recorded or uploaded) is
            accepted.

    Returns:
        The decoded transcription string, or ``""`` when there is no audio.
    """
    # Gradio calls the function with None when the user submits no audio.
    if audio_data is None:
        return ""

    # The tuple is (rate, data) -- not (data, rate).
    sample_rate, samples = audio_data
    waveform = torch.as_tensor(samples)
    if waveform.numel() == 0:
        return ""

    # Integer PCM is scaled by the dtype's full-scale value so the samples
    # become floats in roughly [-1, 1]; float input is only cast to float32.
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        full_scale = float(torch.iinfo(waveform.dtype).max) + 1.0
        waveform = waveform.to(torch.float32) / full_scale

    # Down-mix multi-channel audio to mono by averaging over the channel axis.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=1)

    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    # Same +5 dB boost the script has always applied.
    waveform = torchaudio.functional.gain(waveform, gain_db=5.0)

    # Pass the whole 1-D waveform and state its rate explicitly so the
    # processor can verify it matches what the checkpoint expects.
    input_values = processor(
        waveform.numpy(),
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    ).input_values

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
# Build the Gradio UI -- one audio input feeding transcribe_audio, plain-text
# output -- and start serving it.
audio_input = gr.Audio()
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs="text",
)
demo.launch()