| | |
| | import nltk |
| | import librosa |
| | import torch |
| | import gradio as gr |
| | from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC |
# Sentence-tokenizer data for nltk.sent_tokenize (used by correct_casing).
# Older NLTK releases read the pickled "punkt" models, while NLTK >= 3.8.2
# reads "punkt_tab" instead, so request both. With default arguments
# nltk.download reports a failed or unknown resource and returns False
# rather than raising, so the extra request is harmless on old installs.
nltk.download("punkt")
nltk.download("punkt_tab")
| |
|
| | |
# Checkpoint identifier handed to both from_pretrained() calls below, so the
# tokenizer and the CTC model always come from the same checkpoint.
model_name = "shizukanabasho/north2"

# Loaded once at import time and shared by every transcription request.
tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
| |
|
def load_data(input_file, target_sr=16000):
    """Load an audio file as a mono waveform at the requested sampling rate.

    Args:
        input_file: Path (or other source accepted by ``librosa.load``) of
            the audio to decode.
        target_sr: Sampling rate in Hz of the returned waveform. Defaults to
            16000, the rate the previous implementation resampled to.

    Returns:
        The waveform returned by ``librosa.load``, down-mixed to mono and
        resampled to ``target_sr``.
    """
    # Let librosa down-mix and resample while decoding. The previous version
    # loaded at librosa's default rate and then called
    # ``librosa.resample(speech, sample_rate, 16000)`` positionally, which
    # raises TypeError on librosa >= 0.10 where ``orig_sr``/``target_sr`` are
    # keyword-only. Its stereo branch was unreachable as well, because
    # ``librosa.load`` already returns mono audio by default.
    speech, _ = librosa.load(input_file, sr=target_sr, mono=True)
    return speech
| | |
def correct_casing(input_sentence):
    """Capitalize the first character of every sentence in a text.

    Args:
        input_sentence: Text containing zero or more sentences.

    Returns:
        The sentences found by ``nltk.sent_tokenize``, each with its first
        character capitalized, joined by single spaces. If the tokenizer
        finds no sentences the result is the empty string.
    """
    sentences = nltk.sent_tokenize(input_sentence)
    # Join with a space: the tokenizer drops the whitespace between
    # sentences, so joining on "" fused them ("Hi.Bye." instead of "Hi. Bye.").
    # Slicing with [:1] keeps this safe should a sentence ever be empty.
    return " ".join(s[:1].capitalize() + s[1:] for s in sentences)
| | |
def asr_transcript(input_file):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    Args:
        input_file: Path of the recording to transcribe, or None when no
            audio was supplied (the UI declares its audio input as optional).

    Returns:
        The transcription decoded by the tokenizer, or "" when
        ``input_file`` is None.
    """
    # Nothing was recorded: return an empty transcript instead of letting
    # the audio loader fail on a missing path.
    if input_file is None:
        return ""

    speech = load_data(input_file)
    input_values = tokenizer(speech, return_tensors="pt").input_values

    # Inference only: disable autograd so no graph is built for the
    # forward pass, which saves memory on every request.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring token id per frame and let the tokenizer
    # turn the first (only) sequence of the batch back into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])
| |
|
# Assemble the web UI piece by piece: a microphone recorder that hands the
# callback a file path, and a text box that shows the returned transcript.
_speaker_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    optional=True,
    label="Speaker",
)
_transcript_output = gr.outputs.Textbox(label="Output Text")

_demo = gr.Interface(
    asr_transcript,
    inputs=_speaker_input,
    outputs=_transcript_output,
    title="ASR using Wav2Vec2.0",
    description="This application displays transcribed text for given audio input",
    theme="grass",
)

# Start serving the interface.
_demo.launch()