Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import librosa | |
| import torch | |
| import torchaudio | |
| from datasets import load_dataset | |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from noisereduce.torchgate import TorchGate as TG | |
| import re | |
| from pydub import AudioSegment | |
| from torchaudio.transforms import Resample | |
| import numpy as np | |
_MODEL_NAME = "rikeshsilwalekg/whisper-small-wer35-ekg"
# Sample rate the audio is brought to before it is handed to the pipeline.
_TARGET_SAMPLE_RATE = 16000
# Built lazily on the first request and reused afterwards, so the model
# weights are not reloaded for every upload.
_ASR_PIPELINE = None


def _get_asr_pipeline(device):
    """Return the shared speech-recognition pipeline, building it on first use.

    Args:
        device: ``torch.device`` the model is moved to and the pipeline runs on.

    Returns:
        The ``transformers`` automatic-speech-recognition pipeline.
    """
    global _ASR_PIPELINE
    if _ASR_PIPELINE is None:
        # Half precision only when a GPU is available.
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            _MODEL_NAME, torch_dtype=torch_dtype, use_safetensors=True
        )
        model.to(device)
        processor = AutoProcessor.from_pretrained(_MODEL_NAME)
        # return_timestamps=True for sentence level timestamps,
        # return_timestamps="word" for word level timestamps.
        _ASR_PIPELINE = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            max_new_tokens=128,
            chunk_length_s=30,
            batch_size=16,
            return_timestamps=False,
            torch_dtype=torch_dtype,
            device=device,
        )
    return _ASR_PIPELINE


def _denoise(samples, sampling_rate, device):
    """Apply TorchGate noise reduction, falling back to the raw samples.

    The gate is called with a ``(batch, samples)`` tensor, so the mono
    NumPy signal is wrapped in a batch of one and unwrapped afterwards.
    Denoising is best-effort: any failure (for example a clip too short for
    the gate's window) is logged and the unmodified signal is returned.

    Args:
        samples: 1-D float NumPy array holding the mono waveform.
        sampling_rate: Sample rate of ``samples`` in Hz.
        device: ``torch.device`` the gate runs on.

    Returns:
        1-D NumPy array: the denoised waveform, or ``samples`` on failure.
    """
    import logging  # local import: the file's import block is left untouched

    try:
        gate = TG(sr=sampling_rate, nonstationary=True).to(device)
        batch = torch.from_numpy(samples).unsqueeze(0).to(device)
        with torch.no_grad():
            cleaned = gate(batch)
        return cleaned.squeeze(0).cpu().numpy()
    except Exception:
        logging.getLogger(__name__).warning(
            "Noise reduction failed; transcribing the unprocessed audio.",
            exc_info=True,
        )
        return samples


def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file to text.

    Args:
        audio_file: Path to the audio file, as supplied by the Gradio
            ``filepath`` audio input. ``None`` or an empty path (nothing
            uploaded) yields an empty transcription.

    Returns:
        str: The text recognised by the speech-recognition pipeline.
    """
    if not audio_file:
        return ""

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Decode straight to the target rate instead of librosa's 22050 Hz
    # default followed by a second resample down to 16 kHz.
    samples, sampling_rate = librosa.load(audio_file, sr=_TARGET_SAMPLE_RATE)
    samples = _denoise(samples, sampling_rate, device)

    prediction = _get_asr_pipeline(device)(samples)
    return prediction["text"]
def _make_audio_input():
    """Build the upload-only audio input across Gradio versions.

    The legacy ``gr.inputs`` namespace is gone in newer Gradio releases, so
    the top-level ``gr.Audio`` component is used. The keyword selecting the
    upload widget differs by version (``source`` in older releases,
    ``sources`` in newer ones); the older spelling is tried first and the
    newer one is used if it is rejected.

    Returns:
        A ``gr.Audio`` component that passes the uploaded file's path to
        the handler.
    """
    try:
        return gr.Audio(source="upload", type="filepath")
    except TypeError:
        return gr.Audio(sources=["upload"], type="filepath")


audio_input = _make_audio_input()
# The description is a single literal: the previous backslash continuation
# inside the string spliced the next line's indentation into the shown text.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=["textbox"],
    title="Nepali Speech To Text",
    description="Upload an audio file and hit the 'Submit' button",
)
iface.launch(inline=False)